/*******************************************************************************
********************************************************************************
************************* ZOC Project Data Build *******************************
********************************************************************************
********************************************************************************
* Primary Author: Chris Campos
*
* Originated : 09-25-2019 
*
* Last updated: 09/2023
*
* Runtime: 240 minutes 
* *****************************************************************************/
* Reset all timers and start timer 1 before any program is defined or run
timer clear 
timer on 1

* Session settings: command tracing (two levels deep) and matrix size limit
set trace on
set tracedepth 2
set matsize 10000


capture program drop main
program define main
  * Driver program: calls every build step of the pipeline in sequence.
  * Each achievement stage is run for both test scores (ELA, then math)
  * before the next stage begins.

  * Define the directory globals used by the programs below
  paths, user("cqcampos")

  * Eleventh-grade sample built from the student panel
  eleventhGradeLAUSD, data($datadir/lausd2002_2021.dta)

  local subjects z_ela_all z_math_all

  *** Estimate Value Added Models ***
  foreach subj of local subjects {
    match_model_estimate, estimation("dummy") testtype("`subj'")
  }

  *** Construct match and ability measures ****
  foreach subj of local subjects {
    calculateMatchEffects, estimation("dummy") testtype("`subj'")
  }

  *** Propensity Score Estimation and creating matched set ***
  foreach subj of local subjects {
    pWeightsStats, estimation("dummy") testtype("`subj'")
  }
  foreach subj of local subjects {
    matchedSet, estimation("dummy") testtype("`subj'")
  }

  *** Clean rank-ordered list data and calculate OVG ***
  cleanChoiceData
  ovg

  *** Put together into an analysis dataset for event-studies ***
  prepEventStudy, subjectMath("FALSE")

  * Estimate deltas (mean utilities) for analysis
  estimateDeltasByYear
  estimateDeltasByYearRobustness

  * Build college sample
  cleanNSC
  buildCollegeSample

  * ITT sample
  buildITT, data($datadir/zoc_analysis_data_ela)
  buildCollegeSampleITT

  * School-level covariates for demand analysis
  cleanTeacherData
  schoolAttributeAggregation

  * Additional intermediate outcomes
  constructBehavior

end



capture program drop paths
program define paths
syntax, [user(string)]
  * Defines the directory globals used throughout the build.
  *   user(string) - accepted for callers, but not used inside this program

  * Root locations
  global dir "DIRECTORY1"
  global buildir "DIRECTORY2"
  global grades "${buildir}/grades"

  * Folders under the project root whose global name equals the folder name
  foreach folder in tables codeR rawdata figures logs {
    global `folder' "${dir}/`folder'"
  }

  * Where to save clean data
  global datadir "${dir}/data"
  global intermediate "${datadir}/intermediate"

end
paths

****************************************************************************
* Make eleventh-grade sample of LAUSD students
*
* This is the sample of students for the achievement-based analysis 
*
****************************************************************************
capture program drop eleventhGradeLAUSD
program define eleventhGradeLAUSD
 syntax, [data(string)]
  * Builds the eleventh-grade student sample and saves it to
  * $datadir/lausd_2002_2021_11thgrade.dta.
  *   data(string) - path of the student-by-year dataset to load

  * data() is syntactically optional, but nothing can be built without it
  if `"`data'"' == "" {
    display as error "eleventhGradeLAUSD: option data() is required"
    exit 198
  }

  * List of ZOC high schools, kept in a tempfile for the merge below
  import delimited "$rawdata/zoc_high_schools_list_for_choice.csv", clear
  tempfile zoc_schools
  save `zoc_schools'

  * Quoted so that a path containing spaces still loads
  use `"`data'"', clear

  * drop unnecessary variables for memory purposes
  foreach term in FALL SPRING {
    foreach subj in ELA MATH SCI SS {
      drop gpa`term'_`subj' numCourses`term'_`subj' numAPCourses`term'_`subj' numHonorsCourses`term'_`subj'
    }
  }
  drop ytd_ap ytd_honors
  drop sat_math sat_write sat_verbal mergeSAT agstatusdescCSU agstatustypecodeCSU agcreditscompleteCSU agstatusdescUC agstatustypecodeUC agcreditscompleteUC
  drop localdistrictcode studentclassofname birthcountry testnameMATH testnameELA


  * Tag ZOC high schools and fix codes 
  merge m:1 preferredlocationcode using `zoc_schools', gen(mergeZOC)
  replace preferredlocationcode = changepreferredto if changepreferredto !=.

  * Identify min and max grade the school teaches (allows us to keep high schools)
  gen zoc_hs = mergeZOC == 3
  bys preferredlocationcode: egen minGradeS = min(gradecode)
  bys preferredlocationcode: egen maxGradeS = max(gradecode)

  * Going to control for mean middle school grade scores in VA models
  gen lagscoreE = z_ela_all if inlist(gradecode, 6, 7, 8)
  gen lagscoreM = z_math_all if inlist(gradecode, 6, 7, 8)
  bys studentpseudoid : egen lagE = mean(lagscoreE)
  bys studentpseudoid : egen lagM = mean(lagscoreM)

  * make cubics in lag scores
  gen lagscoreE_2 = lagE^2
  gen lagscoreE_3 = lagE^3
  gen lagscoreM_2 = lagM^2
  gen lagscoreM_3 = lagM^3

  * Flag students without middle-school scores, then zero-fill the lag terms
  gen missingLagE = missing(lagE)
  gen missingLagM = missing(lagM)

  foreach v in lagE lagM lagscoreE_2 lagscoreE_3 lagscoreM_2 lagscoreM_3 {
    replace `v' = 0 if missing(`v')
  }


  * drop student-year duplicates 
  duplicates drop studentpseudoid endyear , force

  * Make a few additional controls
  * number of suspensions in middle school
  gen lagSuspensions = incidents if inlist(gradecode, 6, 7, 8)
  bys studentpseudoid: egen numSuspensions = total(lagSuspensions)
  replace numSuspensions=0 if numSuspensions==.
  drop incidents

  * keep those that serve high school students
  keep if maxGradeS==12 | maxGradeS==11
  * keep only relevant scores 
  keep if gradecode>=9 & gradecode<=11


  egen school_year = group(preferredlocationcode endyear)
  replace parentedulevelname = 2 if parentedulevelname ==.
  tab parentedulevelname , gen(parent_e)
  tab ethnicity, gen(eth_)
  bys preferredlocationcode: egen minSchoolYear = min(endyear)

  * NOTE(review): the eth_# positions depend on the sorted levels of
  * ethnicity in the loaded data - confirm that 5, 3, 10 are still the
  * hispanic, black and white categories whenever the input changes
  gen hispanic = eth_5==1
  gen black = eth_3==1
  gen white = eth_10 ==1
  gen other = hispanic ==0 & black==0 & white==0

  * Restricting to students in eleventh grade (for VA estimation)
  keep if gradecode==11
  label var lagE "Lagged 6th, 7th, 8th grade ELA scores"
  label var lagM "Lagged 6th, 7th, 8th grade Math scores"
  label var lagscoreE_2 "Squared lagged 6,7,8 grade ELA scores"
  label var lagscoreE_3 "Cubed lagged 6,7,8 grade ELA scores"
  label var lagscoreM_2 "Squared lagged 6,7,8 grade Math scores"
  label var lagscoreM_3 "Cubed lagged 6,7,8 grade Math scores"
  label var hispanic "Indicator for hispanic student"
  label var black "Indicator for black student"
  label var white "Indicator for white student"
  label var other "Indicator for non-black, hispanic, or white student"


  save "$datadir/lausd_2002_2021_11thgrade.dta", replace
end

****************************************************************************
* match_model_estimate 
*
* Description: Estimates generalized value-added model 
*
* estimation(string) - estimation approach
* testtype(string) - exam, either Math or ELA 
*
****************************************************************************
capture program drop match_model_estimate
program define match_model_estimate
syntax, [estimation(string) testtype(string)]
  * Estimates the value-added / match model for one test score on the
  * eleventh-grade sample and writes four files to $datadir:
  *   match_model_estimates_and_output_<testtype>_dummy    (data plus residuals)
  *   match_model_coef_estimates_raw_<testtype>_dummy      (parmest output)
  *   match_coefficients_clean_<testtype>_dummy            (school-level slopes)
  *   va_coefficients_clean_<testtype>_dummy               (school-year effects)
  *
  *   estimation(string) - estimation approach; only "dummy" is implemented
  *   testtype(string)   - name of the test-score variable used as outcome

  * Fail early: without a regression, parmest below would have no (or stale)
  * estimates to work with
  if "`estimation'" != "dummy" {
    display as error "match_model_estimate: only estimation(dummy) is implemented"
    exit 198
  }
  if "`testtype'" == "" {
    display as error "match_model_estimate: option testtype() is required"
    exit 198
  }
  local nameappend "_dummy"

  clear
  set matsize 10000
  set max_memory 128g, permanently
  timer on 1 

  use "$datadir/lausd_2002_2021_11thgrade.dta", clear

  * drop years where no exam was taken (or no reliable test data)
  drop if endyear==2014 | endyear>2019

  * Fix some issues with school codes and naming 
  replace preferredlocationcode=8829 if regexm(schoollocationname, "ROOSEVELT")
  replace preferredlocationcode=8611 if regexm(schoollocationname, "MENDEZ")
  replace preferredlocationcode=7722 if regexm(schoollocationname, "ANGELOU")
  replace preferredlocationcode=8544 if regexm(schoollocationname, "ROYBAL")
  replace preferredlocationcode=8543 if regexm(schoollocationname, "BELMONT")
  replace preferredlocationcode=8729 if regexm(schoollocationname, "LINCOLN")
  replace preferredlocationcode=8721 if regexm(schoollocationname, "JORDAN")

  keep if endyear >2002
  * Number of students at school (will use to condition to schools with at least a minimum number of students)
  bys preferredlocationcode  endyear: gen numStudents=_N
  drop if numStudents <15 


  * Zero-fill missing scores; those rows are excluded from the regression
  * through missingTestScore but still receive a prediction afterwards
  gen missingTestScore = missing(`testtype')
  replace `testtype' = 0 if missingTestScore==1

  * school and school-by-year dummies (byte storage: values are only 0/1)
  levelsof preferredlocationcode, local(schools)
  foreach s of local schools{
    gen byte dummy`s' = preferredlocationcode==`s'
    levelsof endyear if preferredlocationcode==`s', local(schoolyears)
    foreach year of local schoolyears{
      gen byte sydummy_`s'_`year' = (preferredlocationcode==`s' & endyear==`year')
    }
  }


  set emptycells drop
  set matsize 10000
  reghdfe, compile
  ftools, compile

  * Zero-fill and demean each covariate; the model_ versions enter the regression
  local vars "female poverty migrant black  hispanic college english_learner  spanish_at_home numSuspensions lagE  lagM"
  foreach var of local vars{
    gen missing_`var' = missing(`var')
    replace `var' = 0 if missing(`var')
    egen tmpvar = mean(`var')
    gen model_`var' = `var' - tmpvar 
    drop tmpvar 
  }
  gen model_lagscoreE_2 = model_lagE^2 
  gen model_lagscoreE_3 = model_lagE^3
  gen model_lagscoreM_2 = model_lagM^2
  gen model_lagscoreM_3 = model_lagM^3

  * School-specific slopes on each demeaned covariate plus school-year intercepts
  reg `testtype' c.(dummy* )#c.(  model_female model_poverty model_migrant model_black  model_hispanic  ///
    model_college model_english_learner  model_spanish_at_home model_numSuspensions model_lagE  model_lagM  )  ///
    model_lagscoreM_2 model_lagscoreM_3 model_lagscoreE_2 model_lagscoreE_3 missing_lagE missing_lagM  ///
    sydummy* if missingTestScore!=1 , ///
    vce(robust ) nocons
  predict ehat, residuals 
  save "$datadir/match_model_estimates_and_output_`testtype'`nameappend'", replace 

  * Replace the data in memory with one row per estimated parameter
  parmest, norestore
  save "$datadir/match_model_coef_estimates_raw_`testtype'`nameappend'", replace 


  ***** do match coefficients first ***** 
  * tag estimates that are part of match and estimates that are school-year dummies 
  * (any parameter name containing the letter c is tagged)
  gen match_coef_tag = regexm(parm, "c")
  keep if match_coef_tag==1
  * drop the unrelated coefs 
  drop if inlist(parm, "model_lagscoreM_2", "model_lagscoreM_3", "model_lagscoreE_2", "model_lagscoreE_3", "missingTestScore")
  * dropped coefs 
  drop if substr(parm,1,2)=="co"

  * Characters 8-11 of the parameter name hold the school code
  * (assumes four-digit codes - TODO confirm)
  gen preferredlocationcode = substr(parm,8,4) if match_coef_tag==1

  split parm, parse("#")
  replace parm2 = subinstr(parm2, "c.", "", .)

  keep preferredlocationcode estimate parm2
  destring preferredlocationcode, replace
  levelsof parm2, clean  local(chars)
  rename parm2 char

  reshape wide estimate , i(preferredlocationcode) j(char) string
  foreach char of local chars{
    egen mean`char' = mean(estimate`char')
    gen match`char' = estimate`char' - mean`char'
    label var mean`char' "Average School Effect"
    label var match`char' "School effect relative to average school"
  }
  * fix missings: zero-fill the estimate, mean and match columns of every
  * covariate found in the estimates
  foreach char of local chars{
    foreach prefix in estimate mean match {
      replace `prefix'`char' = 0 if missing(`prefix'`char')
    }
  }

  save "$datadir/match_coefficients_clean_`testtype'`nameappend'", replace 


  **** Clean school-by-year effects ****
  use "$datadir/match_model_coef_estimates_raw_`testtype'`nameappend'", clear 
  gen match_coef_tag = regexm(parm, "c")
  keep if match_coef_tag==0
  * Untagged names are split on underscores into prefix, school code and year
  split parm , parse("_")
  drop parm1 
  rename (parm2 parm3) (preferredlocationcode endyear)
  keep preferredlocationcode endyear estimate stderr
  * The two missing-lag indicators are also untagged; remove them
  drop if preferredlocationcode=="lagE" | preferredlocationcode=="lagM"
  destring preferredlocationcode endyear, replace 
  * demean within year 
  bys endyear: egen meanVAYear = mean(estimate) 
  gen vaestimate = estimate - meanVAYear
  drop meanVAYear
  save "$datadir/va_coefficients_clean_`testtype'`nameappend'", replace 
  timer off 1 
  timer list 1 

end 



****************************************************************************
* calculateMatchEffects
*
* Description: Takes the output from estimation and constructs measures of 
* match quality and ability
****************************************************************************
capture program drop calculateMatchEffects 
program define calculateMatchEffects 
syntax, [estimation(string) testtype(string)]
  * Combines the saved regression output with the cleaned coefficients to
  * build student-level and school-by-year measures of ATE, match and ability.
  *   estimation(string) - "dummy" selects the _dummy file suffix
  *   testtype(string)   - test-score variable the model was estimated on

  local nameappend = cond("`estimation'"=="dummy", "_dummy", "")
  local stub `testtype'`nameappend'

  * School-by-year means of the covariates among 11th-grade students
  local covs sped female poverty migrant black white other hispanic college english_learner  spanish_at_home numSuspensions lagE lagscoreE_2 lagscoreE_3 lagM lagscoreM_2 lagscoreM_3 model_*
  use $datadir/match_model_estimates_and_output_`stub', clear 
  keep preferredlocationcode endyear `covs'
  collapse (mean) `covs', by(preferredlocationcode endyear)
  save $datadir/va_match_model_post_`stub'.dta, replace

  *********************************************************
  * construct predictions for ability, match, and ATE
  *********************************************************

  * Student-level data without the indicator columns
  use $datadir/match_model_estimates_and_output_`stub', clear 
  drop sydummy* dummy* 
  * school-level slopes (betas)
  merge m:1 preferredlocationcode using $datadir/match_coefficients_clean_`stub', gen(mergeBetas)
  * school-by-year effects (alphas)
  merge m:1 preferredlocationcode endyear using $datadir/va_coefficients_clean_`stub', gen(mergeAlphas)

  * school-by-year value added (ATEs)
  rename vaestimate ATE

  * match quality: sum over covariates of the school-relative slope times
  * the demeaned covariate
  local match_f ""
  local xs "lagE lagM black  english_learner female hispanic migrant college  poverty spanish_at_home  numSuspensions "
  foreach x of local xs {
    replace matchmodel_`x' = 0 if missing(matchmodel_`x')
    replace `x' = 0 if missing(`x')
    local term "matchmodel_`x'*(model_`x')"
    if "`match_f'" == "" {
      local match_f "`term'"
    }
    else {
      local match_f "`match_f' +`term'"
    }
  }
  display "`match_f'"
  gen match = `match_f'

  * ability is what remains of the fitted value after removing ATE and match
  gen yhat = `testtype' - ehat
  gen ability = yhat - ATE - match

  * Student-level file
  preserve 
    keep studentpseudoid endyear yhat ability ATE match 
    save $datadir/student_match_data_`stub'.dta, replace 
  restore

  * School-by-year file, carrying the most common school name per code
  rename estimate school_effect_estimate
  decode preferredlocationname, gen(name)
  bys preferredlocationcode: egen modeName = mode(name) , maxmode
  collapse (mean) school_effect_estimate  ability yhat ATE match  lagE lagM black  english_learner female hispanic migrant college  poverty spanish_at_home  numSuspensions (count) numStudents = studentpseudoid , by(preferredlocationcode modeName endyear )
  order modeName, after(preferredlocationcode)
  rename modeName name

  save $datadir/match_decomposition_school_data_`stub'.dta, replace

end



****************************************************************************
* Estimate school-level propensity scores of ZOC in 2012
*
* Will use to create matched sample 
*
****************************************************************************
capture program drop pWeightsStats
program define pWeightsStats
syntax, [data(string) traditional(string) estimation(string) testtype(string)]
  * Estimates school-level propensity scores for ZOC membership on 2012 data,
  * saves them to $datadir/pweights.dta, exports an overlap figure, and
  * writes a ZOC vs non-ZOC balance table.
  *   estimation(string) - "dummy" selects the _dummy file suffix
  *   testtype(string)   - test score whose school-level file is loaded
  *   data(), traditional() - accepted but not used inside this program
  if "`estimation'"=="dummy"{
    local nameappend "_dummy"
  }

  import delimited "$rawdata/zoc_high_schools_list.csv", clear
  tempfile zoc_schools
  save `zoc_schools'

  use "$datadir/match_decomposition_school_data_`testtype'`nameappend'.dta", clear
  merge m:1 preferredlocationcode using `zoc_schools'
  replace preferredlocationcode = changepreferredto if changepreferredto !=.
  * keep the pre-intervention year obs only
  keep if endyear == 2012
  gen zoc_hs = zone!=""
  sum zoc_hs
  local pi = r(mean)


  * Relabel variables for regression output
  local chars "lagE lagM black hispanic  english_learner female migrant spanish_at_home poverty college numSuspensions numStudents"
  foreach char of local chars{
    if "`char'" == "ATE" local rowname "School Value Added"
    if "`char'" =="lagE" local rowname "Incoming ELA Scores"
    if "`char'" =="lagM" local rowname "Incoming Math Scores"
    if "`char'" =="yhat" local rowname "Incoming ELA"
    if "`char'" =="black" local rowname "Black"
    if "`char'" =="hispanic" local rowname "Hispanic"
    if "`char'" =="white" local rowname "White"
    if "`char'" =="english_learner" local rowname "English Learner"
    if "`char'" =="sped" local rowname "Special Education"
    if "`char'" =="female" local rowname "Female"
    if "`char'" =="migrant" local rowname "Migrant"
    if "`char'" =="spanish_at_home" local rowname "Spanish at home"
    if "`char'" =="poverty" local rowname "Poverty"
    if "`char'" =="college" local rowname "Parents College +"
    if "`char'" =="numSuspensions" local rowname "Incoming Suspensions"
    if "`char'" =="numStudents" local rowname "Incoming Cohort Size"
    label var `char' "`rowname'"
  }

  * NOTE(review): the first two specifications are identical, so the exported
  * table has a repeated column - confirm which specification was intended
  logit zoc_hs  yhat black english_learner female hispanic migrant college poverty spanish_at_home numSuspensions numStudents, noconstant   vce(robust)
  outreg2 using "$tables/propensity_score_model_output.tex", replace tex label
  logit zoc_hs yhat black english_learner female hispanic migrant college poverty spanish_at_home numSuspensions numStudents, noconstant   vce(robust)
  outreg2 using "$tables/propensity_score_model_output.tex", append tex label
  logit zoc_hs black english_learner female hispanic migrant college poverty spanish_at_home numSuspensions numStudents, noconstant   vce(robust)
  outreg2 using "$tables/propensity_score_model_output.tex", append tex label

  * Specification whose fitted values become the propensity score
  logit zoc_hs ATE lagE lagM black english_learner female hispanic migrant college poverty spanish_at_home numSuspensions numStudents , noconstant   vce(robust)
  predict xb, xb
  gen phat = exp(xb)/(1 + exp(xb))
  gen outliers = (phat<.05) | phat>.95
  save "$datadir/pweights.dta", replace  
  * look at overlap
  twoway (hist phat if zoc_hs==1 , lcolor(navy) fcolor(none) frac width(.1)) ///
    (hist phat if zoc_hs ==0 , lcolor(maroon) fcolor(none) frac  width(.1)) if outliers==0, ///
    legend(order(1 "ZOC" 2 "Non-ZOC")) ///
    xtitle("Estimated Propensity Score")
  graph export "$figures/propensity_score_overlap.pdf", replace


  * Make a balance table between treated and non-treated
  import delimited "$rawdata/zoc_high_schools_list_for_choice.csv", clear
  tempfile zoc_schools
  save `zoc_schools'

  use "$datadir/match_decomposition_school_data_`testtype'`nameappend'.dta", clear
  merge m:1 preferredlocationcode using `zoc_schools'
  replace preferredlocationcode = changepreferredto if changepreferredto !=.
  keep if endyear ==2012
  gen zoc_hs = zone!=""
  gen treat = zoc_hs==1


  local chars "ATE yhat black hispanic  english_learner  female migrant spanish_at_home poverty college numSuspensions numStudents"
  texdoc init $tables/school_balance_final.tex, replace force
  texdoc write \begin{table}
  texdoc write \caption{School-level Balance}
  texdoc write \begin{tabular}{lccc} \hline \hline
  texdoc write  & (1) & (2) & (3)  \\
  texdoc write  & ZOC & Non-ZOC & Difference  \\ \hline \\
  foreach char of local chars {

    * Weight by cohort size, except when cohort size itself is the outcome.
    * (The macro must be closed with ' or the test is always true.)
    if "`char'" !="numStudents"{
      reghdfe `char' treat [aweight=numStudents], absorb(endyear) vce(cluster preferredlocationcode)
    }
    else{
      reghdfe `char' treat , absorb(endyear) vce(cluster preferredlocationcode)
    }
    local control = round(_b[_cons], .001)
    local treat = round(_b[_cons] + _b[treat], .001)

    * Two-sided p-value for the ZOC vs non-ZOC difference
    local df = e(df_r)
    local diff = round(_b[treat], .001)
    local diff_se = round(_se[treat], .001)
    local diff_p = (2*ttail(`df', abs(_b[treat]/_se[treat]) ) )
    local diff_star ""
    if `diff_p' <=.01 local diff_star "***"
    else if `diff_p' <=.05 local diff_star "**"
    else if `diff_p' <=.10 local diff_star "*"
    count if e(sample) ==1 & treat==0
    local N_c= r(N)
    count if e(sample) ==1 & treat==1
    local N_t= r(N)

    if "`char'" == "ATE" local rowname "School Value Added"
    if "`char'" =="yhat" local rowname "Incoming ELA"
    if "`char'" =="black" local rowname "Black"
    if "`char'" =="hispanic" local rowname "Hispanic"
    if "`char'" =="white" local rowname "White"
    if "`char'" =="english_learner" local rowname "English Learner"
    if "`char'" =="sped" local rowname "Special Education"
    if "`char'" =="female" local rowname "Female"
    if "`char'" =="migrant" local rowname "Migrant"
    if "`char'" =="spanish_at_home" local rowname "Spanish at home"
    if "`char'" =="poverty" local rowname "Poverty"
    if "`char'" =="college" local rowname "Parents College +"
    if "`char'" =="numSuspensions" local rowname "Incoming Suspensions"
    if "`char'" =="numStudents" local rowname "Incoming Cohort Size"

    texdoc write `rowname' & `treat' & `control' & `diff'`diff_star'  \\
    texdoc write & & & (`diff_se') \\
  }
  * School counts come from the last regression in the loop
  texdoc write & & & \\
  texdoc write Schools & `N_t' & `N_c' &  \\ \hline \hline
  texdoc write \end{tabular}
  texdoc write \end{table}
  texdoc close

end

****************************************************************************
* Create matched set of schools
*
* For each school, pick a counterfactual that is similar in terms of
* 1. Hispanic Share
* 2. Poverty share
*
* If there are multiple, pick the one with the closest propensity score
****************************************************************************
capture program drop matchedSet
program define matchedSet
syntax, [testtype(string) estimation(string)]
  * Builds a 1:1 matched comparison school for every ZOC school using 2012
  * school characteristics and the propensity scores in $datadir/pweights.dta,
  * and saves the result to $datadir/matched_set.dta.
  *   testtype(string)   - test score whose school-level file is loaded
  *   estimation(string) - "dummy" selects the _dummy file suffix

  * Start from an empty file that matched schools are appended to
  clear 
  tempfile matched_set
  save `matched_set', emptyok
  if "`estimation'"=="dummy"{
    local nameappend "_dummy"
  }

  import delimited "$rawdata/zoc_high_schools_list.csv", clear
  tempfile zoc_schools
  save `zoc_schools'

  use "$datadir/match_decomposition_school_data_`testtype'`nameappend'.dta", clear
  merge m:1 preferredlocationcode using `zoc_schools'

  * Make sure that we exclude  community day schools (CDS) or other continuation schools or magnet campuses
  rename name prefname 
  foreach pattern in "CDS" "OPP" "CONTN" "ANGEL'S GATE HS" "SP ED" "MAG" "DBM" {
    drop if regexm(prefname, "`pattern'")
  }
  replace prefname = proper(prefname )

  * Flag continuation schools by (proper-cased) name
  gen continuation = 1 if regexm(prefname, "Boyle Heights HS")
  foreach pattern in "San Antonio" "Stoney Point" "Mt Lukens" "Highland Park" "Cheviot Hills" "View Park" "Aggeler" {
    replace continuation = 1 if regexm(prefname, "`pattern'")
  }
  * NOTE(review): the list is tokenised on spaces, so Boyle and Heights are
  * matched as two separate patterns - confirm this breadth is intended
  local schools Boyle Heights Metropolitan Monterey Odyssey Pueblo Ramona Rodia ///
            Addams Einstein Grey Independence Owensmouth Thoreau Wooden ///
            Earhart Burke Evergreen Lewis London Mission Rogers ///
            Central Kahlo Avalon Hope Moneta Patton Ellington Phoenix ///
            Whitman Young 
  foreach school of local schools{
    replace continuation = 1 if regexm(prefname, "`school'")
  }
  replace continuation= 0 if missing(continuation)
  replace continuation =1 if  regexm(prefname, "CYESIS") | regexm(prefname, "Cyesis") 
  drop if continuation  ==1

  bys preferredlocationcode : egen minYear = min(endyear)
  bys preferredlocationcode : egen maxYear = max(endyear)
  keep if endyear == 2012
  merge m:1 preferredlocationcode using "$datadir/pweights.dta", gen(mergePweights) keep( 1 3)

  * Five-quantile groups (named decile); not used by the matching below
  xtile his_decile = hispanic , nquantiles(5)
  xtile pov_decile = poverty , nquantiles(5)
  xtile coll_decile = college , nquantiles(5)

  * list of non-zoc schools
  preserve
    keep if zone==""
    tempfile nzoc
    save `nzoc'
  restore


  tempfile data 
  save `data'
  levelsof preferredlocationcode if zone!="", local(zoc)
  foreach z of local zoc{
    di "Working on school `z'"

    * Characteristics of the ZOC school being matched
    use `data', clear 
    sum phat if preferredlocationcode == `z'
    local pscore = r(mean)
    sum hispanic if preferredlocationcode == `z'
    local his = r(mean)
    sum poverty if preferredlocationcode == `z'
    local pov = r(mean)
    sum college if preferredlocationcode == `z'
    local coll = r(mean)
    sum lagE if preferredlocationcode==`z'
    local lagE = r(mean)

    * Candidate pool: non-ZOC schools not yet used as a match
    use `nzoc', clear

    gen his_distance = abs(hispanic - `his')
    gen pov_distance = abs(poverty - `pov')
    gen coll_distance = abs(college - `coll')
    gen lagE_distance = abs(lagE - `lagE')
    count 
    count if pov_distance <=.25 & his_distance <=0.25
    if(r(N) >0){
      * if there are schools, then pick from that pool. otherwise just use pscore 
      keep if pov_distance <=.25 & his_distance <=0.25
    }
    count
    assert r(N) >0
    * pick the closest propensity score; the school code and the stable
    * option make the choice reproducible when distances tie or are missing
    gen distance = abs(phat - `pscore')
    sort distance preferredlocationcode, stable
    keep if _n==1
    local rm_school = preferredlocationcode[1]
    gen school_matched_to = `z'
    * save to list of matched schools
    append using `matched_set'
    save `matched_set', replace

    * remove school from list e.g., 1:1 match
    use `nzoc', clear
    drop if preferredlocationcode==`rm_school'
    save `nzoc', replace

  }

  use `matched_set', clear
  keep preferredlocationcode his_distance pov_distance coll_distance lagE_distance distance school_matched_to
  save "$datadir/matched_set.dta", replace


end


****************************************************************************
* Cleans ZOC Application Data 
*
* Processes raw choice data and creates a clean sample, merged with information
* necessary for analysis
*
* 
****************************************************************************
capture program drop cleanChoiceData
program define cleanChoiceData

  set seed 1234
  set sortseed 1234 

  * Assignment data has the correct costcenter codes that should be in choice data
  * Use this to correct incorrectly specified cost center codes in choice data
  import delimited "$rawdata/zocassignments.csv", clear favorstrfixed    
  contract schoolname costcenter
  drop if schoolname ==""
  * only duplicates are from the same school with same plocn
  duplicates drop schoolname, force
  tempfile names
  save `names'

  * Read in choice data and correct cost center codes
  import delimited "$rawdata/zocchoices.csv", clear favorstrfixed    
  drop costcenter
  merge m:1 schoolname using `names'


  * simple cleaning and preparing  
  gen endyear = schyr
  tostring costcenter, replace
  gen preferredlocationcode = substr(costcenter,2,4)
  destring preferredlocationcode, replace
  * tag applications with missing pseudoid
  replace pseudoid=strtrim(pseudoid)
  gen missingid = pseudoid=="N"
  bys appid: egen appmissingid = max(missingid)
  drop if appmissingid==1
  drop missingid appmissingid v11 _freq _merge
  drop if pseudoid==""

   * Drop Animo Charter School because it is out of district and we have zero data 
  drop if preferredlocationcode==7693
  * Drop Angelou school - not a ZOC
  drop if preferredlocationcode ==8563
  tempfile temp
  save `temp'


  * Merge in clean zoc school list
  import delimited $rawdata/zoc_high_schools_list_for_choice.csv, clear
  tempfile zoc_schools
  save `zoc_schools'


  use `temp', clear
  split schoolname, parse("@")

  * Drop schools that are not ZOC 
  drop if preferredlocationcode==.
  drop if regexm(schoolname2, "Banning")==1 | regexm(schoolname2, "Sotomayor") | regexm(schoolname2, "South Region") | regexm(schoolname1, "Banning")
  merge m:1 preferredlocationcode using `zoc_schools', gen(cleanZOCList)

  * Make changes according to programs plocn
  replace preferredlocationcode = changepreferredto if changepreferredto !=.
  drop   schoolname1 schoolname2 zoc_school_name zoc_minyear zoc_maxyear

  label var appid "Application identifier"
  label var preferredlocationcode "School preferred location code"
  label var zone "Zone of Choice school belongs to"
  label var comp "How many competing schools (with plocns) in zone"
  drop if appid==.

  * some apps have missing rank (e.g. 2,3 ,5, 6,...) - change to 1,2,3,4,5,...
  sort endyear appid  order
  bys endyear appid: gen checkOrder = _n
  gen outoforder = checkOrder!=order
  replace order = checkOrder
  drop checkOrder
  save `temp', replace


  * Merge school characteristic data
  use $datadir/match_decomposition_school_data_z_ela_all_dummy.dta, clear 
  merge m:1 preferredlocationcode using `zoc_schools', gen(cleanZOCList) keep(1 3)
  * Make changes according to programs plocn
  replace preferredlocationcode = changepreferredto if changepreferredto !=.
  tempfile schools
  save `schools'


  use `temp', clear
  * SJ Contreras has issues with its name and 8207 (Academic Leadership Academy) is adequate 
  replace preferredlocationcode =8207 if preferredlocationcode==8527

  * Assign 2013 chars to 2014 obs (missing exams that year )
  gen y2014 = endyear==2014
  replace endyear = 2013 if y2014==1
  merge m:1 preferredlocationcode endyear using `schools', gen(mergeSchools) keep(1 3)
  replace endyear = 2014 if y2014 ==1
  drop y2014

  * For Solis LA, Sylmar Academy, Mendez Medical, Narbonne HARTS on or before 2014, assign zero
  * These were new schools and didn't have test scores (for 11th grade) until three years after their first cohort 
  replace ATE = 0 if inlist(preferredlocationcode, 7658, 7752, 7784, 8838) & endyear<=2014
  replace match = 0 if inlist(preferredlocationcode, 7658, 7752, 7784, 8838) & endyear<=2014
  replace ability = 0 if inlist(preferredlocationcode, 7658, 7752, 7784, 8838) & endyear<=2014

  * Merge student information (to get distance pairs)
  destring pseudoid, gen(studentpseudoid)
  rename preferredlocationcode choice_plocn
  format studentpseudoid %20.0g
  merge m:1 studentpseudoid endyear using $datadir/lausd2002_2021.dta, gen(mergeBuild) keep(1 3) keepusing(parentedulevelname homelanguagedescription languageclasscode ethnicity block_x block_y move censusblockid stu_city stu_zip  )

  decode censusblockid, gen(blockid)
  destring blockid, replace
  format blockid %20.0g
  sort choice_plocn censusblockid block_x block_y
  egen distance_pairs = group(choice_plocn censusblockid block_y block_x)
  save `temp', replace

  use $rawdata/plocn_cde_xwalk.dta, clear
  rename preferredlocationcode choice_plocn
  tempfile xwalk
  save `xwalk'

  * prep cde data
  import excel "$rawdata/pubschls.xlsx", sheet("School Downloadable Data") firstrow clear
  tempfile cde
  save `cde'


  use `temp', clear
  merge m:1 choice_plocn using `xwalk', keep(1 3) gen(mergeXwalk)
  tostring cdecode , replace
  replace cdecode = "0" + cdecode if length(cdecode )==6
  replace cdecode = "1964733" + cdecode
  gen CDSCode = cdecode

  merge m:1 CDSCode using `cde', keep(1 3) gen(mergeCDE) keepusing(Latitude Longitude)
  destring Latitude Longitude, replace
  geodist block_y block_x Latitude Longitude , gen(dist) miles
  save `temp', replace

  count if missing(dist) & mergeBuild!=1

  * Drop folks without addresses 
  drop if missing(dist) 

  * Merge in info on counterfactual zone school
  import delimited $rawdata/census_block_id_hs_attendance_zones.csv, clear
  format blockid10 %20.0g
  replace assigned_code = "" if assigned_code =="NA"
  destring assigned_code, replace
  duplicates tag blockid, gen(t)
  drop if t>0 & missing(assigned_code )
  duplicates report blockid10
  drop t
  duplicates tag blockid, gen(t)
  bys blockid10 : gen id = _n
  drop v10 t  haszone
  drop v1
  reshape wide assigned_code leaid schnam ncessch , i(blockid) j(id)
  rename zoc zoc_block
  rename blockid10 blockid
  tempfile cf
  save `cf'

  use `temp', clear
  merge m:1 blockid using `cf', gen(mergeCF) keep(1 3)

  * Need to make assignments for those with missing blockids or assigned zones
  * Everyone in Bell is assigned Bell HS 
  replace assigned_code1 = 8536 if (zone=="Bell" | zone=="Bell/South Gate") & missing(assigned_code1)
  replace assigned_code1 = 8536 if (zone=="Bell" | zone=="Bell/South Gate")  & missing(assigned_code1)
  * Everyone in Belmont is assigned Belmont HS 
  replace assigned_code1 = 8543 if (zone=="Belmont") & missing(assigned_code1)
  replace assigned_code1 = 8543 if (zone=="Belmont") & missing(assigned_code1)

  
  * going to assign the remaining missings at other zones (very few observations)
  gen rand = runiform()
  replace assigned_code1 = 8829 if zone=="Boyle Heights" & missing(assigned_code1) & rand<=.5
  replace assigned_code1 = 8611 if zone=="Boyle Heights" & missing(assigned_code1) & rand>.5
  replace assigned_code1 = 8575 if zone=="Carson" & missing(assigned_code1)
  replace assigned_code1 = 8679 if zone=="Eastside" & missing(assigned_code1)

  replace assigned_code1 = 8650 if zone=="Fremont"
  replace assigned_code1 = 8714 if zone=="Jefferson" & missing(assigned_code1) & rand<=.5
  replace assigned_code1 = 8716 if zone=="Jefferson" & missing(assigned_code1) & rand>.5
  replace assigned_code1 = 8721 if zone=="Jordan" & missing(assigned_code1)
  replace assigned_code1 = 8700 if zone=="HP" & missing(assigned_code1)

  replace assigned_code1 = 8779 if zone=="Narbonne" 
  replace assigned_code1 = 8618 if zone=="NE" & missing(assigned_code1) & rand<=.5
  replace assigned_code1 = 8729 if zone=="NE" & missing(assigned_code1) & rand>.5
  replace assigned_code1 = 8843 if zone=="NV" & missing(assigned_code1) & rand<=.5
  replace assigned_code1 = 8878 if zone=="NV" & missing(assigned_code1) & rand>.5

  replace assigned_code1 = 8501 if zone =="RFK" & missing(assigned_code1)

  replace assigned_code1 = 8871 if zone=="South Gate" & missing(assigned_code1) & rand<=.5
  replace assigned_code1 = 8881 if zone=="South Gate" & missing(assigned_code1) & rand>.5

  * make the southeast high school zone adjustment (based on geography)
  replace assigned_code1 = 8881 if block_x <=-118.23 & block_x >=-118.225 & assigned_code1==8871

  * make changes according to those with 2 assigned schools -- pick one that fits their zone
  replace assigned_code1= assigned_code2 if zone=="Bell" & assigned_code2==8536
  replace assigned_code1= assigned_code2 if zone=="Bell/South Gate" & assigned_code2==8536
  replace assigned_code1= assigned_code2 if zone=="Belmont" & assigned_code2==8543
  replace assigned_code1= assigned_code2 if zone=="Fremont" & assigned_code2==8650

  * fill in those with empty zones 
  replace zone = "NE" if schoolname=="Academy of Environmental & Social Policy (ESP) @ Lincoln High School"
  replace zone = "Belmont" if schoolname=="Civitas School of Leadership @ Edward R. Roybal Learning Center"
  replace zone = "Eastside" if schoolname =="East Los Angeles Performing Arts Academy - Dance Department @ Torres High School"
  replace zone = "Eastside" if schoolname =="East Los Angeles Performing Arts Academy - Music Department @ Torres High School"
  replace zone = "Eastside" if schoolname =="East Los Angeles Performing Arts Academy - Theater Department @ Torres High School"
  replace zone = "Belmont" if schoolname=="LA Teacher Preperatory High School @ Edward R. Roybal Learning Center"
  replace zone = "NE" if schoolname =="Leadership in Entertainment and Media Arts (LEMA) @ Lincoln High School"
  replace zone = "Belmont" if schoolname=="Los Angeles Teacher Preparatory High School @ Edward R. Roybal Learning Center"
  replace zone = "Boyle Heights" if schoolname =="Roosevelt High School?s Academy of Environmental and Social Policy (ESP) @ East Los Angeles Skill Ce"
  
  * fix this incorrectly tagged zone 
  replace zone = "NE" if choice_plocn==8729 // Roosevelt with the change in preferred location code during the sample
  replace zone = "BH" if zone=="Boyle Heights"
  replace zone = "SG" if zone=="South Gate"
  save $datadir/choiceData.dta, replace


end


********************************************************************
* Calculate OVG using the first two cohorts
* 
********************************************************************
capture program drop ovg
program define ovg 
  * Purpose: estimate rank-ordered logit preference models by zone and
  *   achievement group on the 2013 and 2014 application cohorts, shrink the
  *   school mean utilities, and turn the log-sum of each zone into a
  *   distance-metric option value gain (OVG) for every applicant.
  * Reads  : $rawdata/plocn_cde_xwalk.dta, $rawdata/pubschls.xlsx,
  *          $datadir/lausd2002_2021.dta, $datadir/choiceData.dta
  * Writes : $datadir/rologit1.dta-rologit3.dta (estimation samples by group),
  *          $datadir/ovg_micro_data.dta (one row per applicant),
  *          $datadir/ovg_micro_data_school.dta (mean OVG by assigned school)
  * Relies on the add-on commands geodist, parmest and ebayes.
  * The code assumes at most 7 schools per zone (slots 1 to 7 below).

  * add CDE code to merge in lat-lon data 
  tempfile dist
  use $rawdata/plocn_cde_xwalk.dta, clear
  rename preferredlocationcode choice_plocn
  tostring cdecode , replace
  replace cdecode = "0" + cdecode if length(cdecode )==6
  replace cdecode = "1964733" + cdecode
  gen CDSCode = cdecode
  save `dist', replace

  * prep cde data
  import excel "$rawdata/pubschls.xlsx", sheet("School Downloadable Data") firstrow clear
  tempfile cde
  save `cde', replace

  * merge in lat-lon data by CDE code
  use `dist' , clear
  merge m:1 CDSCode using `cde', keep(1 3) gen(mergeCDE) keepusing(Latitude Longitude)
  destring Latitude Longitude, replace
  rename choice_plocn preferredlocationcode
  keep preferredlocationcode Latitude Longitude
  rename (Latitude Longitude) (choice_y choice_x)
  save `dist', replace

  * student covariates and achievement groups (ELA z-score below -.5, above .5, in between)
  use $datadir/lausd2002_2021, clear
  keep if endyear >=2010
  gen score_low = z_ela_all <-.5 & !missing(z_ela_all)
  gen score_high = z_ela_all >.5 & !missing(z_ela_all)
  gen score_avg = score_low==0 & score_high==0
  gen hispanic = ethnicity==5
  gen black = ethnicity==3
  keep studentpseudoid endyear score_* gradecode poverty english_learner spanish_at_home college migrant female hispanic black incidents
  local vars "poverty english_learner spanish_at_home college migrant female hispanic black incidents"
  foreach var of local vars{
    rename `var' stu_`var'
  }
  tempfile scores
  save `scores'

  * empty container that accumulates the coefficient tables
  tempfile master_ests
  clear 
  save `master_ests', emptyok


  * merge new data with clean choice data 
  use $datadir/choiceData.dta, clear
  merge m:1 studentpseudoid endyear using `scores', gen(mergeStuInfo) keep(1 3)
  * assign the missings to the average group  ~ 4% of obs
  * FIX: unmatched applicants have all three indicators missing. egen group()
  * without the missing option returns missing when any input is missing, so
  * filling score_avg alone silently dropped these rows from every category.
  replace score_low = 0 if missing(score_low)
  replace score_high = 0 if missing(score_high)
  replace score_avg = 1 if missing(score_avg)
  * group() numbers the sorted combinations: 1 = avg, 2 = high, 3 = low
  egen category = group(score_low score_high score_avg)

  drop if dist==.
  gen dist_parm = .
  * fix potentially problematic distances at 25, the 95th percentile of the distribution
  replace dist = 25 if dist>25
  replace zone ="Bell" if zone=="Bell/South Gate"
  * one zone per application: the modal zone among its listed schools
  bys appid endyear: egen mode_zone = mode(zone), maxmode
  replace zone = mode_zone
  drop mode_zone

  * any remaining fixes for the loop 
  replace zone = "Jordan" if choice_plocn==7693
  replace zone ="Bell" if zone=="Bell/South Gate"
  replace zone = "BH" if zone=="Boyle Heights"
  replace zone = "SG" if zone=="South Gate"

  * save datasets for rologit estimations 
  levelsof category, local(levels)
  foreach l of local levels{
    preserve   
    keep if category == `l'
    save $datadir/rologit`l'.dta, replace 
    restore 
  }
  tempfile temp 
  save `temp'

  * estimate within each category
  foreach l of local levels{
    use $datadir/rologit`l'.dta, clear 
    * going to estimate preferences for the first two years 
    levelsof zone if (endyear==2013 | endyear ==2014) , local(zones)
    foreach zone of local zones{
      * dummies by group for schools 
      levelsof choice_plocn if (endyear==2013 | endyear==2014 ) & zone=="`zone'" , local(choices)
      foreach c of local choices{
        gen dummy_`zone'_`c' = choice_plocn==`c'
      }
    }
    foreach zone of local zones{
      levelsof choice_plocn if (endyear==2013 | endyear==2014) & zone=="`zone'" , local(choices)
      local leaveOut = 1
      * FIX: reset per zone so a zone without a qualifying school cannot
      * inherit the omitted school of the previous zone
      local leaveOutSchool ""
      * leave out least popular school (smallest positive first-choice share)
      foreach c of local choices{
        sum dummy_`zone'_`c' if (endyear==2013 | endyear==2014) & zone == "`zone'" & order ==1
        if r(mean) < `leaveOut' & r(mean)>0{
          local leaveOut=r(mean)
          local leaveOutSchool = `c'
        }
      }
      if `leaveOut' < 1 {
        * renaming takes the omitted school out of the dummy_ wildcard below
        rename dummy_`zone'_`leaveOutSchool'  leaveOut_`zone'_`leaveOutSchool'
        * NOTE(review): this overwrites dist in every year and the change is
        * saved with the data, so it reaches ovg_micro_data - confirm intent
        replace dist = 0 if choice_plocn==`leaveOutSchool'
      }
      rologit order  dummy_`zone'_*  dist  if (endyear==2013 | endyear==2014 ) & zone=="`zone'" ,   group(appid  ) reverse vce(cluster appid )
      replace dist_parm = _b[dist] if zone=="`zone'"

      * keep track of distance coefficient 
      save $datadir/rologit`l'.dta, replace 
      preserve
      parmest, norestore 
      if `leaveOut' < 1 {
        * add left out school (normalized to zero): duplicate row 1 to the end
        expand 2 in 1 
      }
      drop if regexm(parm, "dist")
      if `leaveOut' < 1 {
        replace parm = "leaveout_`zone'_`leaveOutSchool'" if _n ==_N 
        replace estimate = 0 if _n==_N 
        gen leaveOutSchool = `leaveOutSchool'
      }
      else {
        * no school was left out in this zone, so there is no row to add
        gen leaveOutSchool = .
      }
      gen group = "`l'"
      append using `master_ests'
      save `master_ests', replace
      restore 
    }
  }
  * keep track of full data with distance coefficients augmented as covariates/columns 
  use $datadir/rologit1.dta, clear
  append using $datadir/rologit2.dta
  append using $datadir/rologit3.dta 

  gen cat = "avg" if score_avg==1
  replace cat = "low" if score_low==1
  replace cat = "high" if score_high==1
  save `temp', replace 

  * some shrinkage on the estimates 
  use `master_ests', clear
  ebayes estimate stderr, gen(delta)
  drop if parm =="o.dist"
  * parm reads prefix_zone_school, so piece 2 is the zone and piece 3 the school
  split parm, parse("_")

  rename parm3 choice_plocn
  rename parm2 zone
  gen cat = "avg" if group=="1"
  replace cat = "high" if group=="2"
  replace cat = "low" if group=="3"

  * one row per zone and group, schools spread across numbered slots
  keep estimate delta choice_plocn zone cat 
  sort zone cat choice_plocn
  bys zone cat: gen id = _n
  reshape wide choice_plocn estimate delta, i(zone cat) j(id)
  save `master_ests', replace


  use `temp', clear
  merge m:1 zone cat using `master_ests', gen(mergeDeltas)

  * Need distances to each option in choice set
  forvalues x = 1/7{
    destring choice_plocn`x', replace
    rename choice_plocn`x' preferredlocationcode
    merge m:1 preferredlocationcode using `dist', keep(1 3) nogen
    geodist   block_y block_x  choice_y choice_x , gen(dist`x') miles
    drop choice_x choice_y
    rename preferredlocationcode choice_plocn`x'
  }

  egen minDistToSchools = rowmin(dist1-dist7)
  gen minDistSchool = choice_plocn1


  * calculate zone e(max)
  forvalues x = 1/7{
    ** those with missing estimates (thus no xth school in that zone) should be zeros in emax calculations
    replace estimate`x' = 0 if missing(estimate`x')
    replace dist`x' = 0 if missing(dist`x')
    replace delta`x' = 0 if missing(delta`x')
    gen u`x'_estimate = exp(estimate`x' + dist`x'*dist_parm)
    gen u`x'_delta = exp(delta`x' + dist`x'*dist_parm)
    * a value of exactly 1 means there was no school in this slot (utility zero), so do not accidentally add 1s in the OVG calculation
    replace u`x'_estimate = 0 if u`x'_estimate ==1
    replace u`x'_delta = 0 if u`x'_delta ==1
  }
  * sum first and then log i.e., log sum  
  egen zone_u_sum_estimate = rowtotal(u*_estimate)
  gen zone_log_sum_estimate = ln(zone_u_sum_estimate)
  egen zone_u_sum_delta = rowtotal(u*_delta)
  gen zone_log_sum_delta = ln(zone_u_sum_delta)

  * calculate counterfactual e(max)
  tostring choice_plocn, replace
  local deltas "estimate delta"
  foreach delta of local deltas{
    * build both sums as one expression over the 7 slots
    local others "0"
    local fallback "0"
    forvalues x = 1/7{
      * FIX: empty slots (missing school code) used to add exp(0) = 1 each to
      * the sum over non-fallback schools, they are now excluded
      local others "`others' + exp((`delta'`x' + dist_parm*dist`x'))*(assigned_code1!=choice_plocn`x' & !missing(choice_plocn`x'))"
      local fallback "`fallback' + (`delta'`x' + dist_parm*dist`x')*(assigned_code1==choice_plocn`x')"
    }
    * this will sum all except counterfactual school
    gen additional_u_`delta'  = `others'
    gen log_additional_u_`delta' = ln(additional_u_`delta')
    * this sums only counterfactual school
    gen fallback_u_`delta' = `fallback'
    * utility differences are scaled by the distance coefficient (miles)
    gen ovg_`delta' = abs((1/dist_parm)*(zone_log_sum_`delta' - fallback_u_`delta'))
    gen cs_new_`delta' =  abs((1/dist_parm)*(log_additional_u_`delta'))
  }


  * one obs per person
  keep if order==1

  replace zone_log_sum_estimate = abs(zone_log_sum_estimate/dist_parm)
  replace zone_log_sum_delta = abs(zone_log_sum_delta/dist_parm)
  gen lnovg = ln(ovg_delta )
  save $datadir/ovg_micro_data, replace
  
  * school-level mean OVG among the first two cohorts, keyed by assigned school
  preserve 
  keep if endyear == 2013 | endyear ==2014
  collapse (mean) ovg_estimate, by(assigned_code1  )
  rename assigned_code1 preferredlocationcode
  rename ovg_estimate school_ovg_estimate
  drop if missing(preferredlocationcode)
  save $datadir/ovg_micro_data_school, replace
  restore 

end


********************************************************************
* Take everything and create analysis test score dataset 
* 
********************************************************************
capture program drop prepEventStudy
program define prepEventStudy
syntax, [data(string) traditional(string) subjectMath(string)]
  * Purpose: assemble the student-level event-study dataset by combining the
  *   value-added model output with ZOC school lists, propensity weights, the
  *   matched control set, student match data and the OVG measures.
  * Options:
  *   subjectMath("TRUE") builds the math dataset, any other value builds ELA.
  *   data() and traditional() are accepted but not referenced in this program.
  * Writes: $datadir/zoc_analysis_data_ela.dta or $datadir/zoc_analysis_data_math.dta

  import delimited "$rawdata/zoc_high_schools_list.csv", clear
  tempfile zoc_schools
  save `zoc_schools'

  * ovg measure
  use $datadir/ovg_micro_data, clear
  duplicates drop studentpseudoid , force
  drop if missing(ovg_estimate)

  * census block level ovg 
  bys censusblockid : egen blockOVG = mean(ovg_estimate )
  gen ovg_estimate2 = blockOVG 

  * flag high ovg students: quartile indicators of ovg_estimate
  * FIX: bins 2 and 3 previously capped ovg_delta with the percentiles of
  * ovg_estimate, all four bins now use ovg_estimate on both sides
  sum ovg_estimate, detail
  gen highOVG_1 =  ovg_estimate <= r(p25) & !missing(ovg_estimate)
  gen highOVG_2 = ovg_estimate > r(p25) & !missing(ovg_estimate) & ovg_estimate <=r(p50)
  gen highOVG_3 = ovg_estimate > r(p50) & !missing(ovg_estimate) & ovg_estimate <=r(p75)
  gen highOVG_4 = ovg_estimate > r(p75) & !missing(ovg_estimate)

  * save the block-level ovg estimates (to add for the pre-period groups later)
  preserve 
  collapse (max) ovg_estimate highOVG_1 highOVG_2 highOVG_3 highOVG_4 , by(censusblockid)
  drop if missing(censusblockid)
  tempfile temp1 
  save `temp1'
  restore 

  rename schoolname mostpreferredname
  destring choice_plocn , replace
  rename choice_plocn mostpreferredlocationcode
  rename dist mostpreferreddistance

  keep studentpseudoid schyr appdate mostpreferredname mostpreferredlocationcode mostpreferreddistance    ovg_delta  highOVG_1 highOVG_2 highOVG_3 highOVG_4 ovg_estimate
  tempfile ovg
  save `ovg'


  * pick the subject once: file suffixes differ only by ela versus math
  local subject "ela"
  if "`subjectMath'"=="TRUE" local subject "math"
  local nameappend "_`subject'"
  local studata "_z_`subject'_all_dummy"

  use $datadir/match_model_estimates_and_output`studata', clear 
  drop sydummy* dummy* 
  * merge betas 
  merge m:1 preferredlocationcode using $datadir/match_coefficients_clean`studata', gen(mergeBetas)
  * merge alphas 
  merge m:1 preferredlocationcode endyear using $datadir/va_coefficients_clean`studata', gen(mergeAlphas)
  rename vaestimate ATE 

  merge m:1 preferredlocationcode using `zoc_schools', keep(1 3) // unmatched are non-zoc schools
  drop _merge
  merge m:1 preferredlocationcode using $datadir/pweights.dta, gen(mergePweights) keep(1 3) keepusing(phat outliers) // unmatched are schools excluded from candidate controls
  merge m:1 preferredlocationcode using $datadir/matched_set.dta, gen(mergeMatchedSchools) keep(1 3) // unmatched are control group schools not matched to a zoc school
  merge m:1 studentpseudoid  using `ovg', gen(mergeOVG) keep(1 3) // unmatched are cohorts we don't have 11th grade scores for
  merge m:1 studentpseudoid endyear using $datadir/student_match_data`studata'.dta, gen(stuMatchData) keep(1 3)  keepusing(ability match) // merge student micro level match data 

  merge m:1 preferredlocationcode using $datadir/ovg_micro_data_school.dta, gen(mergeSchoolOVG) keep(1 3)

  * assign ovg to those in the pre- and post-period based on their censusblock
  * NOTE(review): the ovg variables already exist in memory from the student
  * level merge above. Without the update option merge keeps master values
  * for matched rows, so this step may not fill anything - confirm intent.
  merge m:1 censusblockid using `temp1', gen(mergeOVGblock)
  local ovgvars "ovg_estimate highOVG_1 highOVG_2 highOVG_3 highOVG_4"
  foreach var of local ovgvars{
    replace `var' = 0 if missing(`var')
  }

  * want to restrict to schools we observe in the pre and post period *
  drop minYear age
  bys preferredlocationcode : egen minYear = min(endyear)
  bys preferredlocationcode : egen maxYear = max(endyear)
  bys preferredlocationcode: gen numObs = _N

  * event-time indicators for ZOC schools. 2012 (b1) is left out, and no
  * indicator is created for 2014
  keep if endyear>=2008
  gen b5 = zoc_hs*(endyear==2008)
  gen b4 = zoc_hs*(endyear==2009)
  gen b3 = zoc_hs*(endyear==2010)
  gen b2 = zoc_hs*(endyear==2011)
  *gen b1 = zoc_hs*(endyear==2012)
  gen a0 = zoc_hs*(endyear==2013)
  gen a2 = zoc_hs*(endyear==2015)
  gen a3 = zoc_hs*(endyear==2016)
  gen a4 = zoc_hs*(endyear==2017)
  gen a5 = zoc_hs*(endyear==2018)
  gen a6 = zoc_hs*(endyear==2019)

  * post-period exposure to OVG for students in a zone
  replace ovg_estimate  = 0 if missing(ovg_estimate )
  replace ovg_delta = 0 if missing(ovg_delta)
  gen post_zoc_ovg = (zone!="")*(endyear>=2013)*ovg_estimate
  gen post_zoc_delta= (zone!="")*(endyear>=2013)*ovg_delta

  gen analysis_schools = (mergeMatchedSchools==3) | (zone!="")

  cap gen eth_5 = hispanic
  cap gen eth_3 = black

  gen post_zoc = (endyear>=2013)*(zone!="")
  * school-specific trends: 2012 school mean of each covariate (and its square
  * and cube) multiplied by year
  local xs "lagE lagM black  english_learner female hispanic migrant college  poverty spanish_at_home  numSuspensions "
  foreach x of local xs{
    gen char12_`x' = `x' if endyear == 2012
    bys preferredlocationcode : egen s_mean_`x'_12 = mean(char12_`x')
    gen trend_`x'_1 = s_mean_`x'_12 * endyear
    gen trend_`x'_2 = s_mean_`x'_12^2 * endyear
    gen trend_`x'_3 = s_mean_`x'_12^3 * endyear
    drop char12_`x' s_mean_`x'_12
  }
  * want to make within zone assignment of high OVG school 
  replace zone = "Bell" if zone== "Bell/South Gate"
  bys zone: egen maxSchoolOVG = max(school_ovg_estimate)
  gen highOVGSchool = school_ovg_estimate == maxSchoolOVG & !missing(school_ovg_estimate)


  * Drop continuation schools
  * Drop choice schools 
  * Analysis focuses on traditional public HS
  decode preferredlocationname, gen(prefname)
  drop if regexm(prefname, "CDS")
  drop if regexm(prefname, "OPP")
  drop if regexm(prefname, "CONTN")
  drop if regexm(prefname, "CYESIS")
  drop if regexm(prefname, "ANGEL'S GATE HS")
  drop if regexm(prefname, "SP ED")
  drop if regexm(prefname, "MAG")
  drop if regexm(prefname, "DBM")
  replace prefname = proper(prefname )

  cap drop continuation 
  gen continuation = 1 if regexm(prefname, "Boyle Heights HS")
  replace continuation = 1 if regexm(prefname, "San Antonio")
  replace continuation = 1 if regexm(prefname, "Stoney Point")
  replace continuation = 1 if regexm(prefname, "Mt Lukens")
  replace continuation = 1 if regexm(prefname, "Highland Park")
  replace continuation = 1 if regexm(prefname, "Cheviot Hills")
  replace continuation = 1 if regexm(prefname, "View Park")
  replace continuation = 1 if regexm(prefname, "Aggeler")
  * NOTE(review): foreach splits this list on spaces, so Boyle Heights is
  * matched as two separate patterns (Boyle, Heights) - confirm that is intended
  local schools Boyle Heights ///
            Metropolitan ///
            Monterey ///
            Odyssey ///
            Pueblo ///
            Ramona ///
            Rodia ///
            Addams ///
            Einstein ///
            Grey ///
            Independence ///
            Owensmouth ///
            Thoreau ///
            Wooden ///
            Earhart ///
            Burke ///
            Evergreen ///
            Lewis ///
            London ///
            Mission ///
            Rogers ///        
            Central ///
            Kahlo ///
            Avalon ///
            Hope ///
            Moneta ///
            Patton  ///
            Ellington ///
            Phoenix ///
            Whitman ///
            Young 
  foreach school of local schools{
    replace continuation = 1 if regexm(prefname, "`school'")
  }
  replace continuation= 0 if missing(continuation)
  * remaining magnets are dropped only outside the analysis schools
  drop if analysis_schools ==0 & (regexm(prefname , "Mag") | regexm(prefname , "Mg"))

  drop if continuation==1

  save $datadir/zoc_analysis_data`nameappend'.dta, replace 
end

********************************************************************
* Clean raw NSC data 
* 
********************************************************************
capture program drop cleanNSC 
program define cleanNSC 
    * Purpose: stack the two raw National Student Clearinghouse extracts and
    *   reduce them to one row per student with first/last college
    *   characteristics, graduation flags and 4-year degree records.
    * Reads : $rawdata/National_Student_Clearinghouse_2009_2013.csv
    *         $rawdata/National_Student_Clearinghouse_2014_2021.csv
    * Writes: $datadir/lausd_2008_2021_hs_graduates.dta
    *         $datadir/lausd_2008_2021_nsc_student_unique.dta
    * Relies on the add-on egen function nvals().

    * ---- stack the raw extracts (the later file stays in memory, the earlier is appended) ----
    tempfile early
    foreach span in 2009_2013 2014_2021 {
        import delimited "$rawdata/National_Student_Clearinghouse_`span'.csv", clear
        drop if missing(student_pseudo_id)
        if "`span'" == "2009_2013" save `early'
    }
    append using `early'

    * data is stacked so some duplicates (on every variable) are naturally created 
    duplicates drop student_pseudo_id requester_return_field record_found_y_n high_school_code high_school_grad_date college_code_branch college_name college_state college_type_2_4_year public_private enrollment_begin enrollment_end enrollment_status graduated graduation_date degree_title major college_sequence, force

    * ---- split date stamps into year and month pieces ----
    tostring high_school_grad_date , gen(hs_grad_date)
    gen hs_grad_year = substr(hs_grad_date,1,4)

    local raw_start enrollment_begin
    local raw_end enrollment_end
    foreach edge in start end {
        tostring `raw_`edge'', gen(term_`edge')
        replace term_`edge' = "" if term_`edge'=="."
        gen term_`edge'_year = substr(term_`edge', 1,4)
        gen term_`edge'_month = substr(term_`edge', 5,2)
    }
    destring term_start_* term_end_*, replace 

    * ---- flag students with at least one record found in NSC ----
    gen everNSC = record_found_y_n=="Y"
    bys student_pseudo_id: egen ever_in_college = max(everNSC)

    * auxiliary dataset: one row per student appearing in the extracts
    preserve 
        contract student_pseudo_id
        drop _freq  
        rename student_pseudo_id studentpseudoid  
        gen graduate_hs = 1 
        save $datadir/lausd_2008_2021_hs_graduates.dta, replace 
    restore 

    * only keep students that ever make it into NSC 
    drop if ever_in_college==0
    drop everNSC ever_in_college requester_return_field

    * Number of distinct colleges a student attends 
    egen number_coll_attended = nvals(college_code_branch), by(student_pseudo_id)
    replace number_coll_attended = 0 if missing(number_coll_attended)
    bys student_pseudo_id:  egen num_colleges_attended = max(number_coll_attended)
    drop number_coll_attended

    rename record_found_y_n record_found_yn
    rename student_pseudo_id studentpseudoid 
    rename public_private publicprivate
    format studentpseudoid %20.0g

    tempfile stacked 
    save `stacked'

    * ---- characteristics of the first and last college observed ----
    tempfile endpoints
    preserve 
        drop if record_found_yn=="N"
        drop if missing(term_start_year) | missing(term_end_year)

        foreach edge in first last {
            * order records by term start and tag the first (or last) one
            sort studentpseudoid term_start_year term_start_month
            local pos 1
            if "`edge'" == "last" local pos _N
            bys studentpseudoid : gen `edge'_college_obs = _n==`pos'

            tempvar yr kind priv oos

            * year of the tagged record
            gen `yr' = term_start_year if `edge'_college_obs==1
            bys studentpseudoid: egen `edge'_college_year = max(`yr')
            gen missing_`edge'_year = missing(`edge'_college_year)
            drop `yr'

            * type (2-year or 4-year) of the tagged college
            gen `kind' = college_type_2_4_year if `edge'_college_obs==1
            bys studentpseudoid: egen `edge'_college_type = mode(`kind') 	
            drop `kind'

            * first college only: is it a UC campus
            if "`edge'" == "first" {
                tempvar uc
                gen `uc' = regexm(college_name, "UNIVERSITY OF CALIFORNIA") if first_college_obs==1
                bys studentpseudoid: egen first_college_uc = max(`uc')
                drop `uc'
            }

            * private (versus public) at the tagged college
            gen `priv' =publicprivate=="Private" if `edge'_college_obs ==1
            bys studentpseudoid: egen `edge'_college_private = max(`priv') 	
            drop `priv'

            * tagged college located outside California
            gen `oos' = college_state !="CA" if `edge'_college_obs==1
            bys studentpseudoid: egen `edge'_out_of_state = max(`oos')
            drop `oos'
        }

        * ever graduated, separately for 2-year and 4-year colleges
        foreach len in 2 4 {
            tempvar done
            gen `done' = graduated=="Y" & college_type_2_4_year=="`len'-year"
            bys studentpseudoid: egen ever_graduated_`len' = max(`done')
            drop `done'
        }

        * collapse to one row per student
        keep studentpseudoid first_college_year first_college_type ///
                first_out_of_state first_college_private ///
                last_college_year last_college_type ///
                last_college_private last_out_of_state ///
                ever_graduated_2 ever_graduated_4 first_college_uc
        duplicates drop studentpseudoid, force 

        * label variables 
        label var first_college_year "First observed year in college"
        label var first_college_type "First college type (2-year versus 4-year)"
        label var first_college_uc "Indicator if first college was a UC"
        label var first_college_private "Indicator if first college was private"
        label var first_out_of_state "Indicator if first college out-of-state"
        label var last_college_year "Last observed year in NSC data"
        label var last_college_private "Indicator if last observed college was private"
        label var last_out_of_state "Indicator if last observed college was out-of-state"
        label var ever_graduated_2 "Indicator if ever graduated from 2-year"
        label var ever_graduated_4 "Indicator if ever graduated from 4-year"

        save `endpoints'
    restore

    * ---- record of every degree earned at a 4-year college, one row per student ----
    tempfile degrees
    preserve 
        keep studentpseudoid college_type_2_4_year major graduation_date degree_title 
        keep if college_type_2_4_year =="4-year" & graduation_date !=. & !missing(major)
        * number degrees chronologically within student
        sort studentpseudoid graduation_date 
        bys studentpseudoid : gen majorid=_n
        drop college_type_2_4_year 
        reshape wide graduation_date degree_title major , i(studentpseudoid) j(majorid)
        save `degrees'
    restore 	

    * ---- student-level dataset ----
    use `stacked',  clear 
    gen hs_grad_month = substr(hs_grad_date,5,2)
    contract studentpseudoid   hs_grad_year hs_grad_month  num_colleges_attended
    * some students have two graduation dates: keep the earliest 
    sort studentpseudoid hs_grad_year hs_grad_month
    by studentpseudoid: keep if _n==1
    isid studentpseudoid
    drop _freq

    merge m:1 studentpseudoid using `endpoints', gen(mergeFirstYearInfor) 
    merge m:1 studentpseudoid using `degrees', gen(merge4yrDegreeInfo)

    save $datadir/lausd_2008_2021_nsc_student_unique.dta, replace 

end 


********************************************************************
* Estimate mean utilities used in Table 2 
* 
********************************************************************
capture program drop estimateDeltasByYear
program define estimateDeltasByYear
  * Purpose: estimate school mean utilities (deltas) with a rank-ordered
  * logit, one model per zone x application year (2013-2019) x achievement
  * group. Groups are Low / Average / High, cut at -0.5 and +0.5 of z_ela_all.
  * Takes no arguments.
  * Reads:  datadir/lausd2002_2021 and datadir/choiceData.dta
  * Writes: datadir/deltas2013_2019.dta (school dummies plus a zero row for
  *         the normalized school) and
  *         datadir/deltas2013_2019_with_distance.dta (school dummies plus the
  *         distance coefficient).
  * Uses the parmest command to turn estimation results into a dataset.
  set seed 12345
  set sortseed 12345

  * Empty shells that the per-cell results are appended onto at the end
  clear  
  tempfile master
  save `master', emptyok 

  clear 
  tempfile masterdist 
  save `masterdist', emptyok
  
  * ---- Student achievement groups and covariates ----
  use $datadir/lausd2002_2021, clear
  keep if endyear >=2010
  gen score_low = z_ela_all <-.5 & !missing(z_ela_all)
  gen score_high = z_ela_all >.5 & !missing(z_ela_all)
  * Students with a missing score are neither low nor high, so they are average
  gen score_avg = score_low==0 & score_high==0
  gen hispanic = ethnicity==5
  gen black = ethnicity==3
  keep studentpseudoid endyear score_* gradecode poverty english_learner spanish_at_home college migrant female hispanic black incidents
  local vars "poverty english_learner spanish_at_home college migrant female hispanic black incidents"
  foreach var of local vars{
    rename `var' stu_`var'
  }
  gen score_cat = "Low" if score_low==1
  replace score_cat = "Average" if score_avg==1
  replace score_cat = "High" if score_high==1
  tempfile scores
  save `scores'

  * ---- Rank-ordered list data, with school codes harmonized by name ----
  use $datadir/choiceData.dta, clear
  rename choice_plocn preferredlocationcode
  replace preferredlocationcode=8829 if regexm(schoolname, "Roosevelt")
  replace preferredlocationcode=8611 if regexm(schoolname, "Mendez")
  replace preferredlocationcode=7722 if regexm(schoolname, "Angelou")
  replace preferredlocationcode=8544 if regexm(schoolname, "Roybal")
  replace preferredlocationcode=8543 if regexm(schoolname, "Belmont")
  replace preferredlocationcode=8729 if regexm(schoolname, "Lincoln")
  replace preferredlocationcode=8721 if regexm(schoolname, "Jordan")
  replace preferredlocationcode = 8829 if preferredlocationcode==7749
  tempfile choice
  save `choice'


  merge m:1 studentpseudoid endyear using `scores', gen(mergeStuScores) keep(1 3)
  * assign the missings to the average group  ~ 4% of obs
  * The estimation cells below are defined by score_cat, so the string
  * category has to be filled as well. Setting only score_avg leaves
  * score_cat empty, and levelsof skips empty strings, which would silently
  * drop these applicants from every cell.
  replace score_avg = 1 if missing(score_avg)
  replace score_low = 0 if missing(score_low)
  replace score_high = 0 if missing(score_high)
  replace score_cat = "Average" if missing(score_cat)
  
  encode score_cat, gen(scoreid)
  egen cov_cell = group(scoreid )

  set matsize 10000
  set emptycells drop

  drop if zone=="Bell/South Gate"
  * Short zone labels: the zone string is embedded in variable and macro names
  replace zone = "BH" if zone=="Boyle Heights"
  replace zone = "SG" if zone =="South Gate"
  levelsof zone , local(zones)
  levelsof score_cat, local(groups) 
  destring costcenter, gen(programid)

  * ---- One rank-ordered logit per zone x year x group ----
  forvalues year = 2013/2019{
    foreach zone of local zones{
      foreach group of local groups{
  
        count if zone=="`zone'" & endyear == `year' & score_cat=="`group'"
        if(r(N)>0){
          preserve 
          display "Working on `zone' in `year'"
          keep if zone=="`zone'" & endyear==`year' & score_cat=="`group'"
          * dummies by group for schools
          levelsof preferredlocationcode  if zone=="`zone'" , local(choices)
          foreach c of local choices{
            gen dummy_`zone'_`c' = preferredlocationcode==`c'
            
          }
          * Normalization: the school with the smallest code is the omitted
          * category. Its dummy is renamed so the dummy_* wildcard skips it,
          * and its distance is set to zero.
          sum preferredlocationcode
          local leaveOutSchool = r(min)
          replace dist = 0 if preferredlocationcode==`leaveOutSchool'
          
          rename dummy_`zone'_`leaveOutSchool' leaveOut_`zone'_`leaveOutSchool'

          * capture: a cell whose model cannot be estimated is stored as an
          * empty file instead of stopping the build
          cap rologit order dummy_`zone'_* dist   , group(appid) reverse vce(cluster appid  )
       
          if _rc==0{
          * Replace the data in memory with one row per coefficient
          parmest, norestore
          
          * parm looks like dummy_ZONE_CODE, so pieces 2 and 3 give the zone
          * and the school code. Both are empty for the dist row.
          split parm, parse("_")
          gen zone = parm2
          gen preferredlocationcode = parm3 
          destring preferredlocationcode, replace
          gen group = "`group'"

          keep parm preferredlocationcode zone  group estimate stderr
          order preferredlocationcode, first
          
          gen year = `year'

          * save a version with distance 
          replace parm = "dist" + "_" + "`zone'" + "_" + "`year'" + "_" + "`group'" if parm=="dist"
          tempfile dist_`zone'_`year'_`group'
          save `dist_`zone'_`year'_`group''

          *save a version without distance 
          * Add a row for the normalized school with estimate and stderr of 0.
          * The row is a copy of row 1, so it keeps the parm label of row 1.
          expand 2 in 1 
          replace preferredlocationcode = `leaveOutSchool' if _n==_N
          replace estimate = 0 if _n==_N 
          replace stderr = 0 if _n==_N 

          * The distance row has no school code and is removed here
          drop if missing(preferredlocationcode)
          tempfile temp_`zone'_`year'_`group'
          save `temp_`zone'_`year'_`group''
          }
          else{
            clear 
            tempfile temp_`zone'_`year'_`group'
            save `temp_`zone'_`year'_`group'', emptyok
          }

          restore 
        }
      }
      
    }
  }

  *append all
  * capture: cells that were skipped or failed have no file to append
  use `master', clear 
  forvalues year = 2013/2019{
    foreach zone of local zones{
      foreach group of local groups{
        cap append using `temp_`zone'_`year'_`group''
      }
    }
  }
  replace group ="avg" if group=="Average"
  replace group = "low" if group=="Low"
  replace group="high" if group=="High"
  gen costcenter = preferredlocationcode
  save $datadir/deltas2013_2019.dta, replace


  use `masterdist', clear 
  forvalues year = 2013/2019{
    foreach zone of local zones{
      foreach group of local groups{
        cap append using `dist_`zone'_`year'_`group''
      }
    }
  }
  replace group ="avg" if group=="Average"
  replace group = "low" if group=="Low"
  replace group="high" if group=="High"
  gen costcenter = preferredlocationcode
  save $datadir/deltas2013_2019_with_distance.dta, replace

end

********************************************************************
* Allow for more flexible distance in deltas
* 
********************************************************************
capture program drop estimateDeltasByYearRobustness
program define estimateDeltasByYearRobustness
  * Purpose: robustness version of the mean-utility estimation. The
  * rank-ordered logit is run per zone x application year (2013-2019) x
  * achievement group with distance and distance squared, and vce(robust).
  * Takes no arguments.
  * Reads:  datadir/lausd2002_2021 and datadir/choiceData.dta
  * Writes: datadir/deltas2013_2019_update_dist_sq.dta (school dummies plus a
  *         zero row for the normalized school) and
  *         datadir/deltas2013_2019_with_distance_dist_sq.dta (school dummies
  *         plus the dist and dist_sq coefficients).
  * Uses the parmest command to turn estimation results into a dataset.
  set seed 12345
  set sortseed 12345

  * Empty shells that the per-cell results are appended onto at the end
  clear  
  tempfile master
  save `master', emptyok 

  clear 
  tempfile masterdist 
  save `masterdist', emptyok

  * ---- Student achievement groups and covariates ----
  use $datadir/lausd2002_2021, clear
  keep if endyear >=2010
  gen score_low = z_ela_all <-.5 & !missing(z_ela_all)
  gen score_high = z_ela_all >.5 & !missing(z_ela_all)
  gen score_avg = score_low==0 & score_high==0
  gen hispanic = ethnicity==5
  gen black = ethnicity==3
  keep studentpseudoid endyear score_* gradecode poverty english_learner spanish_at_home college migrant female hispanic black incidents
  local vars "poverty english_learner spanish_at_home college migrant female hispanic black incidents"
  foreach var of local vars{
    rename `var' stu_`var'
  }
  gen score_cat = "Low" if score_low==1
  replace score_cat = "Average" if score_avg==1
  replace score_cat = "High" if score_high==1
  tempfile scores
  save `scores'

  * ---- Rank-ordered list data, with school codes harmonized by name ----
  use $datadir/choiceData.dta, clear
  rename choice_plocn preferredlocationcode
  replace preferredlocationcode=8829 if regexm(schoolname, "Roosevelt")
  replace preferredlocationcode=8611 if regexm(schoolname, "Mendez")
  replace preferredlocationcode=7722 if regexm(schoolname, "Angelou")
  replace preferredlocationcode=8544 if regexm(schoolname, "Roybal")
  replace preferredlocationcode=8543 if regexm(schoolname, "Belmont")
  replace preferredlocationcode=8729 if regexm(schoolname, "Lincoln")
  replace preferredlocationcode=8721 if regexm(schoolname, "Jordan")
  replace preferredlocationcode = 8829 if preferredlocationcode==7749
  tempfile choice
  save `choice'


  merge m:1 studentpseudoid endyear using `scores', gen(mergeStuScores) keep(1 3)
  * Applicants without a matched score record go to the average group, the
  * rule stated in estimateDeltasByYear. The cells below are defined by
  * score_cat and levelsof skips empty strings, so leaving score_cat empty
  * would silently drop these applicants from every cell.
  replace score_avg = 1 if missing(score_avg)
  replace score_low = 0 if missing(score_low)
  replace score_high = 0 if missing(score_high)
  replace score_cat = "Average" if missing(score_cat)
  encode score_cat, gen(scoreid)

  egen cov_cell = group(scoreid )

  set emptycells drop
  set matsize 10000
  drop if zone=="Bell/South Gate"
  * Short zone labels: the zone string is embedded in variable and macro names
  replace zone = "BH" if zone=="Boyle Heights"
  replace zone = "SG" if zone =="South Gate"
  *drop if zone=="BH"
  levelsof zone , local(zones)
  levelsof score_cat, local(groups) 
  gen dist_sq = dist^2 

  * ---- One rank-ordered logit per zone x year x group ----
  forvalues year = 2013/2019{
    foreach zone of local zones{
      foreach group of local groups{
        count if zone=="`zone'" & endyear == `year' & score_cat=="`group'"
        if(r(N)>0){
          preserve 
          display "Working on `zone' in `year'"
          keep if zone=="`zone'" & endyear==`year' & score_cat=="`group'"
          * dummies by group for schools
          levelsof preferredlocationcode  if zone=="`zone'" , local(choices)
          foreach c of local choices{
            gen dummy_`zone'_`c' = preferredlocationcode==`c'
          }
          * Normalization: the school with the smallest code is the omitted
          * category. Its dummy is renamed so the dummy_* wildcard skips it,
          * and both distance terms are set to zero for it.
          sum preferredlocationcode
          local leaveOutSchool = r(min)
          replace dist = 0 if preferredlocationcode==`leaveOutSchool'
          replace dist_sq = 0 if preferredlocationcode==`leaveOutSchool'
          
          rename dummy_`zone'_`leaveOutSchool' leaveOut_`zone'_`leaveOutSchool'

          * capture: a cell whose model cannot be estimated is stored as an
          * empty file instead of stopping the build
          cap rologit order dummy_`zone'_* dist  dist_sq , group(appid) reverse vce(robust  )
       
          if _rc==0{
          * Replace the data in memory with one row per coefficient
          parmest, norestore
          
          * parm looks like dummy_ZONE_CODE, so piece 3 is the school code
          * (empty for the two distance rows). The zone is taken from the
          * loop rather than from piece 2, because splitting dist_sq on the
          * underscore would otherwise store sq as the zone of that row.
          split parm, parse("_")
          gen zone = "`zone'"
          gen preferredlocationcode = parm3 
          destring preferredlocationcode, replace
          gen group = "`group'"

          keep parm preferredlocationcode zone  group estimate stderr
          order preferredlocationcode, first
          gen year = `year'

          * save a version with distance 
          replace parm = "dist" + "_" + "`zone'" + "_" + "`year'" + "_" + "`group'" if parm=="dist"
          replace parm = "dist_sq" + "_" + "`zone'" + "_" + "`year'" + "_" + "`group'" if parm=="dist_sq"
          tempfile dist_`zone'_`year'_`group'
          save `dist_`zone'_`year'_`group''

          *save a version without distance 
          * Add a row for the normalized school with estimate and stderr of 0.
          * The row is a copy of row 1, so it keeps the parm label of row 1.
          expand 2 in 1 
          replace preferredlocationcode = `leaveOutSchool' if _n==_N
          replace estimate = 0 if _n==_N 
          replace stderr = 0 if _n==_N 

          * The distance rows have no school code and are removed here
          drop if missing(preferredlocationcode)
          tempfile temp_`zone'_`year'_`group'
          save `temp_`zone'_`year'_`group''
          }
          else{
            clear 
            tempfile temp_`zone'_`year'_`group'
            save `temp_`zone'_`year'_`group'', emptyok
          }

          restore 
        }
      }
      
    }
  }

  *append all
  * capture: cells that were skipped or failed have no file to append
  use `master', clear 
  forvalues year = 2013/2019{
    foreach zone of local zones{
      foreach group of local groups{
        cap append using `temp_`zone'_`year'_`group''
      }
    }
  }
  replace group ="avg" if group=="Average"
  replace group = "low" if group=="Low"
  replace group="high" if group=="High"

  save $datadir/deltas2013_2019_update_dist_sq.dta, replace


  use `masterdist', clear 
  forvalues year = 2013/2019{
    foreach zone of local zones{
      foreach group of local groups{
        cap append using `dist_`zone'_`year'_`group''
      }
    }
  }
  replace group ="avg" if group=="Average"
  replace group = "low" if group=="Low"
  replace group="high" if group=="High"

  save $datadir/deltas2013_2019_with_distance_dist_sq.dta, replace

end



*******************************************************************************************
* buildCollegeSample
* 
* Constructs college sample for event-study analysis
*******************************************************************************************
capture program drop buildCollegeSample
program define buildCollegeSample
  * Purpose: build the student-level college-outcome sample for the event
  * studies. Starts from all 11th graders with endyear 2008 or later, attaches
  * ZOC school, matched-set, NSC enrollment and HS graduation information,
  * creates outcome and event-time variables, applies the school
  * restrictions, and saves datadir/zoc_college_sample.dta.
  * Takes no arguments.

  * ---- ZOC high school roster ----
  import delimited "$rawdata/zoc_high_schools_list.csv", clear
  tempfile zoc_schools
  save `zoc_schools'

  * ---- Codes of the schools that appear in the matched set ----
  use $datadir/match_model_estimates_and_output_z_ela_all_dummy.dta , clear
  merge m:1 preferredlocationcode using `zoc_schools'
  merge m:1 preferredlocationcode using $datadir/pweights.dta, gen(mergePweights)
  merge m:1 preferredlocationcode using $datadir/matched_set.dta, gen(mergeMatchedSchools)

  gen analysis_sample = (mergeMatchedSchools==3)
  collapse (max) analysis_sample, by(preferredlocationcode)
  keep if analysis_sample==1
  keep preferredlocationcode
  tempfile schools
  save `schools'


  * ---- Student records: every 11th grader, not only the VA sample ----
  use if gradecode ==11 & endyear>=2008 using  $datadir/lausd2002_2021, clear

  * Harmonize school codes by name. Later entries overwrite earlier ones
  * when a name matches more than one pattern.
  local fix_codes 8829 8611 7722 8544 8543 8729 8721
  local fix_names ROOSEVELT MENDEZ ANGELOU ROYBAL BELMONT LINCOLN JORDAN
  local n_fix : word count `fix_codes'
  forvalues k = 1/`n_fix' {
    local code : word `k' of `fix_codes'
    local name : word `k' of `fix_names'
    replace preferredlocationcode=`code' if regexm(schoollocationname, "`name'")
  }
  merge m:1 preferredlocationcode using `zoc_schools', gen(mergeZOC)
  merge m:1 preferredlocationcode using $datadir/matched_set.dta, gen(mergeMatchedSchools)

  * College enrollment records
  merge m:1 studentpseudoid using $datadir/lausd_2008_2021_nsc_student_unique.dta, gen(mergeNSC) keep( 1 3)
  merge m:1 preferredlocationcode using `schools', gen(mergeAnalsysisSample)

  gen analysis_schools = mergeAnalsysisSample==3
  * High school graduation records. Students without one count as non-graduates.
  merge m:1 studentpseudoid using $datadir/lausd_2008_2021_hs_graduates.dta, gen(mergeHSgraduate) keep(1 3)
  replace graduate_hs = 0 if missing(graduate_hs)

  * ---- College outcomes ----
  gen went_to_college = mergeNSC==3
  gen four_year = first_college_type =="4-year"
  gen two_year = first_college_type =="2-year"
  gen four_year_or_two = inlist(first_college_type, "4-year", "2-year")
  gen other_college = went_to_college==1 & four_year_or_two==0
  gen uc_enroll = four_year==1 & first_college_uc==1
  gen private_enroll = four_year==1 & first_college_private==1
  gen outofstate_enroll = four_year==1 & first_out_of_state==1

  * ---- Event-time indicators, ZOC school x cohort year ----
  * There is no indicator for 2012
  gen zoc_hs=zone!=""
  local ev_names b5 b4 b3 b2 a0 a1 a2 a3 a4 a5 a6 a7
  local ev_years 2008 2009 2010 2011 2013 2014 2015 2016 2017 2018 2019 2020
  local n_ev : word count `ev_names'
  forvalues k = 1/`n_ev' {
    local nm : word `k' of `ev_names'
    local yr : word `k' of `ev_years'
    gen `nm' = zoc_hs*(endyear==`yr')
  }

  gen hispanic = ethnicity==5
  gen black = ethnicity==3
  tab ethnicity
  gen white = ethnicity==10
  * zoc: 1 = ZOC school, 0 = non-ZOC school in the matched set, 2 = neither
  gen zoc = cond(zoc_hs==1, 1, cond(analysis_schools==0, 2, 0))
  gen four_year_public = four_year ==1 & first_college_private==0

  gen four_year_uc = four_year ==1 & first_college_uc ==1
  gen four_year_nuc_public = four_year ==1 & first_college_uc ==0 & first_college_private ==0
  gen four_year_nuc_private = four_year ==1 & first_college_uc ==0 & first_college_private ==1

  gen post_zoc = (endyear>=2013)*zoc_hs

  * ---- Trends: school mean of each 2012 characteristic, in powers 1 to 3,
  *      interacted with year ----
  foreach cov in black english_learner female hispanic migrant college poverty spanish_at_home {
    gen char12_`cov' = `cov' if endyear == 2012
    bys preferredlocationcode : egen s_mean_`cov'_12 = mean(char12_`cov')
    gen trend_`cov'_1 = s_mean_`cov'_12 * endyear
    forvalues p = 2/3 {
      gen trend_`cov'_`p' = s_mean_`cov'_12^`p' * endyear
    }
    drop char12_`cov' s_mean_`cov'_12
  }



  * Every ZOC school is an analysis school
  replace analysis_schools= 1 if zoc_hs==1

  * ---- School restrictions (same as the main sample): drop continuation
  *      and choice schools, keep traditional public high schools ----
  decode preferredlocationname, gen(prefname)
  drop if regexm(prefname, "CDS") | regexm(prefname, "OPP") | regexm(prefname, "CONTN") ///
        | regexm(prefname, "CYESIS") | regexm(prefname, "ANGEL'S GATE HS") | regexm(prefname, "SP ED")
  replace prefname = proper(prefname )

  * Continuation schools flagged by multi-word name fragments
  gen continuation = 0
  foreach pat in "Boyle Heights HS" "San Antonio" "Stoney Point" "Mt Lukens" ///
                 "Highland Park" "Cheviot Hills" "View Park" "Aggeler" {
    replace continuation = 1 if regexm(prefname, "`pat'")
  }
  * Continuation schools flagged by single-word fragments.
  * NOTE(review): Boyle and Heights are two separate patterns here, so any
  * school name containing either word is flagged. Confirm this matches the
  * main-sample restriction.
  local contn_tokens Boyle Heights Metropolitan Monterey Odyssey Pueblo Ramona ///
          Rodia Addams Einstein Grey Independence Owensmouth Thoreau Wooden ///
          Earhart Burke Evergreen Lewis London Mission Rogers Central Kahlo ///
          Avalon Hope Moneta Patton Ellington Phoenix Whitman Young
  foreach tok of local contn_tokens {
    replace continuation = 1 if regexm(prefname, "`tok'")
  }
  * Magnet programs are removed outside the analysis schools
  drop if analysis_schools ==0 & (regexm(prefname , "Mag") | regexm(prefname , "Mg"))

  drop if continuation==1

  save $datadir/zoc_college_sample.dta, replace 

end 




capture program drop buildITT 
program define buildITT 
syntax, [data(string)]

    * Purpose: build the intent-to-treat (ITT) version of the analysis data.
    * Treatment is assigned from the zone of residence observed in middle
    * school instead of the high school attended.
    * Option:
    *   data(string)  path of the analysis dataset to start from. It must
    *                 contain studentpseudoid, zoneid, endyear, z_ela_all,
    *                 censusblockid and the event-time dummies b5 through a6.
    * Writes: datadir/analysis_ela_itt.dta

    * The option is declared optional, but nothing can be built without it
    if "`data'" == "" {
        display as error "buildITT: option data() is required"
        exit 198
    }

    use `data', clear 
    replace zoneid = "NA" if missing(zoneid)
    tempfile temp 
    save `temp'

    use if inrange(gradecode, 6,11) & endyear>=2002 & endyear <=2019 using  $datadir/lausd2002_2021, clear
    * one record per student and grade
    duplicates drop studentpseudoid gradecode, force 
    * keep only census and attendance zone boundary info 
    keep studentpseudoid gradecode censusblockid objectid zoneid 
    * reshape to flag address info at each grade in middle school
    reshape wide censusblockid objectid zoneid , i(studentpseudoid) j(gradecode)
    * Middle school zone: grade 8 if known, otherwise grade 7, otherwise grade 6
    gen middlezoneid = zoneid8 if zoneid8!="NA" & !missing(zoneid8)
    replace middlezoneid = zoneid7 if zoneid7!="NA" & !missing(zoneid7) & missing(middlezoneid)
    replace middlezoneid = zoneid6 if zoneid6!="NA" & !missing(zoneid6) & missing(middlezoneid)

    * same for census blocks 
    decode censusblockid8  , generate(block8)
    decode censusblockid7  , generate(block7)
    decode censusblockid6  , generate(block6)
    gen middlecensusblockid = block8 if  !missing(block8)
    replace middlecensusblockid = block7 if  !missing(block7) & missing(block8)
    replace middlecensusblockid = block6 if !missing(block6) & missing(block7) & missing(block8)

    tempfile address8 
    save `address8'

    * merge back with main data
    use `temp', clear 
    merge m:1 studentpseudoid using `address8', gen(mergeAddress8) keep(1 3)

    replace zoneid8 = "NA" if missing(zoneid8)
    replace middlezoneid = "NA" if missing(middlezoneid)

    * Flagged as living in zoc in middle school
    gen residenceInZone8= middlezoneid !="NA"

    * drop original ATT event-time dummies and convert to middle school address analogs 
    drop b5-a6 
    * a1 is rebuilt below. Remove any copy that sat outside the b5-a6 range
    * so that the gen cannot fail.
    capture drop a1

    gen b5 = residenceInZone8*(endyear==2008)
    gen b4 = residenceInZone8*(endyear==2009)
    gen b3 = residenceInZone8*(endyear==2010)
    gen b2 = residenceInZone8*(endyear==2011)
    *gen b1 = zoc_hs*(endyear==2012)
    gen a0 = residenceInZone8*(endyear==2013)
    * 2014 indicator: it is removed with the other ATT dummies above and has
    * to be rebuilt with them, as a1_2 is in buildCollegeSampleITT
    gen a1 = residenceInZone8*(endyear==2014)
    gen a2 = residenceInZone8*(endyear==2015)
    gen a3 = residenceInZone8*(endyear==2016)
    gen a4 = residenceInZone8*(endyear==2017)
    gen a5 = residenceInZone8*(endyear==2018)
    gen a6 = residenceInZone8*(endyear==2019)

    * drop those with missing test scores (equal exactly zero; same as in main sample)
    drop if z_ela_all==0
    drop if endyear<=2007

    * Fall back to the census block on the current record when no middle
    * school block was found
    decode censusblockid, gen(blockstr)
    replace middlecensusblockid = blockstr if missing(middlecensusblockid )

    save $datadir/analysis_ela_itt.dta, replace 


end 


capture program drop buildCollegeSampleITT 
program define buildCollegeSampleITT
syntax, [groupfe(string)]

  * Purpose: build the intent-to-treat (ITT) college sample. 11th graders
  * with endyear 2008-2019 get NSC college outcomes plus event-time
  * indicators based on the zone of residence observed in middle school.
  * Option:
  *   groupfe(string)  accepted but not used inside this program
  * Writes: datadir/college_sample_itt.dta
  * NOTE(review): unlike buildCollegeSample, no school-code fixes by school
  * name are applied before the school-level merges. Confirm this is intended.

  ******** College Event Study **************
  import delimited "$rawdata/zoc_high_schools_list.csv", clear
  tempfile zoc_schools
  save `zoc_schools'

  * Codes of the schools that appear in the matched set
  use $datadir/match_model_estimates_and_output_z_ela_all_dummy.dta , clear
  merge m:1 preferredlocationcode using `zoc_schools'
  merge m:1 preferredlocationcode using $datadir/pweights.dta, gen(mergePweights)
  merge m:1 preferredlocationcode using $datadir/matched_set.dta, gen(mergeMatchedSchools)
  gen analysis_sample= mergeMatchedSchools==3
  collapse (max) analysis_sample, by(preferredlocationcode)
  keep if analysis_sample==1
  drop analysis_sample
  tempfile schools
  save `schools'

  use if gradecode ==11 & endyear>=2008 & endyear<=2019 using  $datadir/lausd2002_2021, clear

  merge m:1 preferredlocationcode using `zoc_schools', gen(mergeZOC)
  merge m:1 preferredlocationcode using $datadir/matched_set.dta, gen(mergeMatchedSchools)
  * merge in NSC data 
  merge m:1 studentpseudoid using $datadir/lausd_2008_2021_nsc_student_unique.dta, gen(mergeNSC2) keep(1 3)
  merge m:1 preferredlocationcode using `schools', gen(mergeAnalsysisSample)
  gen analysis_schools = mergeAnalsysisSample==3

  * additional variables for analysis 
  gen hispanic = ethnicity==5
  gen black = ethnicity==3
  tab ethnicity
  gen white = ethnicity==10

  * college vars 
  * The NSC merge indicator is mergeNSC2. It is referenced by its full name
  * so that the code does not depend on variable-name abbreviation.
  gen went_to_college = mergeNSC2==3
  gen four_year = first_college_type =="4-year"
  gen two_year = first_college_type =="2-year"
  gen four_year_or_two = four_year ==1 | two_year ==1
  gen other_college = four_year_or_two==0 & mergeNSC2==3
  gen uc_enroll = four_year==1 & first_college_uc==1
  gen private_enroll = four_year==1 & first_college_private==1
  gen outofstate_enroll = four_year==1 & first_out_of_state==1

  gen four_year_public = four_year ==1 & first_college_private==0
  gen four_year_uc = four_year ==1 & first_college_uc ==1
  gen four_year_nuc_public = four_year ==1 & first_college_uc ==0 & first_college_private ==0
  gen four_year_nuc_private = four_year ==1 & first_college_uc ==0 & first_college_private ==1

  tempfile temp 
  save `temp'

  * same address assignment as other ITT sample construction 
  use if inrange(gradecode, 6,11) & endyear>=2002 using  $datadir/lausd2002_2021, clear
  * one record per student and grade
  duplicates drop studentpseudoid gradecode, force 
  keep studentpseudoid gradecode censusblockid objectid zoneid 
  reshape wide censusblockid objectid zoneid , i(studentpseudoid) j(gradecode)
  
  * Middle school zone: grade 8 if known, otherwise grade 7, otherwise grade 6
  gen middlezoneid = zoneid8 if zoneid8!="NA" & !missing(zoneid8)
  replace middlezoneid = zoneid7 if zoneid7!="NA" & !missing(zoneid7) & missing(middlezoneid)
  replace middlezoneid = zoneid6 if zoneid6!="NA" & !missing(zoneid6) & missing(middlezoneid)
  * same for census blocks 
  decode censusblockid8  , generate(block8)
  decode censusblockid7  , generate(block7)
  decode censusblockid6  , generate(block6)
  gen middlecensusblockid = block8 if  !missing(block8)
  replace middlecensusblockid = block7 if  !missing(block7) & missing(block8)
  replace middlecensusblockid = block6 if !missing(block6) & missing(block7) & missing(block8)

  tempfile address8 
  save `address8'

  use `temp', clear 
  merge m:1 studentpseudoid using `address8', gen(mergeAddress8) keep(1 3)

  replace zoneid8 = "NA" if missing(zoneid8)
  replace middlezoneid = "NA" if missing(middlezoneid)


  * Flagged as living in zoc in middle school
  gen residenceInZone8= middlezoneid !="NA"
    
  * Event-time indicators by middle school residence. None for 2012.
  gen b5_2 = residenceInZone8*(endyear==2008)
  gen b4_2 = residenceInZone8*(endyear==2009)
  gen b3_2 = residenceInZone8*(endyear==2010)
  gen b2_2 = residenceInZone8*(endyear==2011)
  gen a0_2 = residenceInZone8*(endyear==2013)
  gen a1_2 = residenceInZone8*(endyear==2014)
  gen a2_2 = residenceInZone8*(endyear==2015)
  gen a3_2 = residenceInZone8*(endyear==2016)
  gen a4_2 = residenceInZone8*(endyear==2017)
  gen a5_2 = residenceInZone8*(endyear==2018)
  gen a6_2 = residenceInZone8*(endyear==2019)

  * Fall back to the census block on the current record when no middle
  * school block was found
  decode censusblockid, gen(blockstr)
  replace middlecensusblockid = blockstr if missing(middlecensusblockid )
  

  * Drop continuation schools
  * Drop choice schools 
  * Analysis focuses on traditional public HS
  decode preferredlocationname, gen(prefname)
  drop if regexm(prefname, "CDS")
  drop if regexm(prefname, "OPP")
  drop if regexm(prefname, "CONTN")
  drop if regexm(prefname, "CYESIS")
  drop if regexm(prefname, "ANGEL'S GATE HS")
  drop if regexm(prefname, "SP ED")
  replace prefname = proper(prefname )

  gen continuation = 1 if regexm(prefname, "Boyle Heights HS")
  replace continuation = 1 if regexm(prefname, "San Antonio")
  replace continuation = 1 if regexm(prefname, "Stoney Point")
  replace continuation = 1 if regexm(prefname, "Mt Lukens")
  replace continuation = 1 if regexm(prefname, "Highland Park")
  replace continuation = 1 if regexm(prefname, "Cheviot Hills")
  replace continuation = 1 if regexm(prefname, "View Park")
  replace continuation = 1 if regexm(prefname, "Aggeler")
  * Each word below is its own pattern, so Boyle and Heights are matched
  * separately
  local schools Boyle Heights ///
          Metropolitan ///
          Monterey ///
          Odyssey ///
          Pueblo ///
          Ramona ///
          Rodia ///
          Addams ///
          Einstein ///
          Grey ///
          Independence ///
          Owensmouth ///
          Thoreau ///
          Wooden ///
          Earhart ///
          Burke ///
          Evergreen ///
          Lewis ///
          London ///
          Mission ///
          Rogers ///        
          Central ///
          Kahlo ///
          Avalon ///
          Hope ///
          Moneta ///
          Patton  ///
          Ellington ///
          Phoenix ///
          Whitman ///
          Young 
  foreach school of local schools{
    replace continuation = 1 if regexm(prefname, "`school'")
  }
  replace continuation= 0 if missing(continuation)
  * Magnet programs are removed outside the analysis schools
  drop if analysis_schools ==0 & (regexm(prefname , "Mag") | regexm(prefname , "Mg"))

  drop if continuation==1

  save $datadir/college_sample_itt.dta, replace 

end 



****************************************************************************
* cleanTeacherData
* 
* Description: cleans teacher data 
****************************************************************************
capture program drop cleanTeacherData
program define cleanTeacherData
  * Purpose: stack the yearly teacher workbooks for 2008-2018, reconcile
  * columns that carry different names in some years, build experience,
  * ethnicity, degree and gender indicators, and save
  * datadir/employees2008_2018.dta.
  * Takes no arguments.

  * ---- Stack the yearly files, newest year on top ----
  clear 
  tempfile stacked
  save `stacked', emptyok
  foreach yr of numlist 2008/2018 {
    import excel "$rawdata/teachers`yr'.xlsx", sheet("Sheet1") firstrow case(lower) clear
    replace age = round(age)
    format teacherpseudo %20.0g
    gen endyear = `yr'
    append using `stacked'
    save `stacked', replace
  }

  * ---- Columns stored under a different name in some years ----
  * employment status sits in column f in 2013 and in column es in 2014
  replace employmentstatustext = f if endyear==2013
  replace employmentstatustext = es if endyear==2014
  drop es f
  * ethnicity is spelled etnicity in the 2012 file
  replace ethnicity = etnicity if endyear==2012
  drop etnicity

  * ---- Years of experience: top code 10+ at 10, dash and blank become 0 ----
  replace tlyearsofexp = "10" if tlyearsofexp=="10+"
  replace tlyearsofexp = "" if tlyearsofexp=="-"
  destring tlyearsofexp, replace
  replace tlyearsofexp = 0 if tlyearsofexp==.
  gen exp0_2 = tlyearsofexp>=0 & tlyearsofexp<=2
  gen exp3_5 = tlyearsofexp>=3 & tlyearsofexp<=5
  gen exp6_9 = tlyearsofexp>=6 & tlyearsofexp<=9
  gen exp10 = tlyearsofexp >=10

  * ---- Ethnicity indicators; other is everyone not flagged above ----
  gen tchr_asian = inlist(ethnicity, "Asian", "Filipino")
  gen tchr_black = ethnicity=="Black"
  gen tchr_latino = ethnicity=="Latino"
  gen tchr_other = !(tchr_asian==1 | tchr_black==1 | tchr_latino==1)
  drop ethnicity

  * ---- Highest degree ----
  gen bachelor = highestdegree =="B"
  gen masters = highestdegree =="M"
  gen doctorate = highestdegree =="D"
  drop highestdegree

  gen female = genderkey=="Female"
  drop genderkey

  * The saved id uses the spelling teacherpsuedoid, which is the merge key
  * used by schoolAttributeAggregation
  rename teacherpseudo teacherpsuedoid
  save $datadir/employees2008_2018.dta, replace
end


****************************************************************************
* schoolAttributeAggregation
* 
* Description: School-level attributes used in preference analysis later
****************************************************************************
capture program drop schoolAttributeAggregation
program define schoolAttributeAggregation
  * Purpose: build school-by-year attributes from course-grade files and
  * teacher records. Each teacher-year is assigned to the single school with
  * the most grade records, teacher characteristics are merged in, and the
  * result is averaged and counted by school and year.
  * Takes no arguments.
  * Reads:  grades/gradesYYYY.csv for 2002-2018 and
  *         datadir/employees2008_2018.dta
  * Writes: datadir/teachers_2002_2018.dta and datadir/school_attributes.dta

  clear 
  tempfile stacked
  save `stacked' , emptyok
  set seed 12345
  set sortseed 12345

  foreach yr of numlist 2002/2018 {
    import delimited "$grades/grades`yr'.csv", clear
    tab departmentname

    * Subject and course-level flags for each grade record
    gen mathematics = departmentname=="MATHEMATICS"
    gen english = inlist(departmentname, "ENGLISH", "READING")
    gen cs = departmentname=="COMPUTER SCIENCE"
    gen science = inlist(departmentname, "SCIENCE", "ENGINEERING")
    gen social = departmentname=="SOCIAL SCIENCE"
    gen art = inlist(departmentname, "VISUAL ART", "THEATER ARTS", "MEDIA ARTS", "DANCE", "MUSIC")
    gen pe = departmentname=="PHYSICAL EDUCATION"
    gen ap = apcourseflag=="Y"
    gen honors = honor_course_flag =="Y"
    format teacherpsuedoid  %20.0g

    * collapse to teacher-school-year; obs counts the grade records
    gen obs = 1
    drop if preferredlocationcode=="UNKNOWN"
    collapse (max) mathematics english cs science social art pe ap honors (count) obs, by(teacherpsuedoid  preferredlocationcode endyear )
    gen other = !(mathematics==1 | english==1 | cs==1 | science==1 | social==1 | art==1 | pe==1)

    * Ensure each teacher is assigned to only one school (the one they teach at the most)
    * Ties are broken by the sort, which is reproducible through sortseed
    bys teacherpsuedoid endyear: egen maxStudents = max(obs)
    keep if obs == maxStudents
    sort teacherpsuedoid endyear obs
    bys teacherpsuedoid endyear: keep if _n==_N
    drop obs

    * Stack this year on top of the earlier years
    append using `stacked'
    save `stacked', replace
    di "`yr'"
  }

  * ---- Merge employee information ----
  isid teacherpsuedoid endyear
  * Note that elementary school teachers not in Gr6-12 transcripts
  * Only teacher-years found in the employee file are kept
  merge m:1 teacherpsuedoid endyear using $datadir/employees2008_2018.dta, gen(mergeEmployeeInfo) keep(1 3)
  keep if mergeEmployeeInfo==3
  drop mergeEmployeeInfo

  * keep teachers (93 percent of obs)
  * drops ROP, principals, and counselors that assigned grades in some way
  keep if regexm(jobtext , "TEACHER") | regexm(jobtext, "TCHR")
  save $datadir/teachers_2002_2018.dta, replace

  * ---- School-by-year shares, headcounts and teacher count ----
  destring preferredlocationcode, replace
  collapse (mean) mathematics english cs science social art pe ap honors ///
          age exp0_2 exp3_5 exp6_9 exp10 ///
          tchr_latino tchr_black tchr_asian tchr_other ///
          bachelor masters doctorate female ///
       (sum) num_math= mathematics  num_english = english num_cs = cs ///
            num_science = science num_social = social num_art =art ///
            num_pe = pe num_ap = ap num_honors = honors ///
            num_exp0_2 = exp0_2 num_exp10 = exp10 num_latino = tchr_latino ///
            num_black = tchr_black num_asian = tchr_asian ///
            num_other = tchr_other ///
            num_bachelor = bachelor num_masters = masters ///
            num_doctor = doctorate num_female = female ///
         (count) teacherpsuedoid, ///
         by(preferredlocationcode endyear)
  save $datadir/school_attributes.dta, replace


end


****************************************************************************
* constructBehavior
*
* Description: builds the student-level intermediate outcomes used later in
*   the analysis.
*
*   Input : $datadir/lausd2002_2021.dta, restricted to gradecode 9-12
*   Output: ${datadir}/student_behavior_data.dta, one row per studentpseudoid
*
*   Steps : (1) derive SAT-taking and agstatus* COMPLETED / MET indicators,
*           (2) zero-fill ytd_honors / ytd_ap and total the AP course counts,
*           (3) collapse by studentpseudoid -- sums for days, incidents,
*               credits and course counts; maxima for indicators and SAT
*               scores,
*           (4) compute attended days as a share of enrolled days.
****************************************************************************
capture program drop constructBehavior
program define constructBehavior

  * Only rows with gradecode in [9, 12] are read from disk
  use if inrange(gradecode, 9, 12) using $datadir/lausd2002_2021.dta, clear

  * SAT indicator: 1 unless all three score fields are missing
  gen tookSAT = !(missing(sat_math) & missing(sat_write) & missing(sat_verbal))

  * System-specific indicators first; the combined ones are their union
  gen csu_complete_status = agstatusdescCSU == "COMPLETED"
  gen csu_met_status      = agstatustypecodeCSU=="MET"
  gen uc_met_status       = agstatustypecodeUC =="MET"
  gen college_complete_status = csu_complete_status | (agstatusdescUC == "COMPLETED")
  gen college_met_status      = uc_met_status | csu_met_status

  * Missing year-to-date honors/AP counts are treated as zero before adding
  foreach courseCount of varlist ytd_honors ytd_ap {
    replace `courseCount' = 0 if missing(`courseCount')
  }
  gen ap_honors = ytd_ap + ytd_honors

  * Fall + spring AP course counts by subject (rowtotal treats missing as 0)
  foreach subj in MATH ELA {
    local stub = lower("`subj'")
    egen ap_`stub' = rowtotal(numAPCoursesFALL_`subj' numAPCoursesSPRING_`subj')
  }

  * Statistics summed over each student's rows
  local sumSpec                                        ///
      total_suspension_days = ofsuspendeddays          ///
      total_incidents       = incidents                ///
      total_ap              = ytd_ap                   ///
      total_honors          = ytd_honors               ///
      total_ap_honors       = ap_honors                ///
      total_uc_credits      = agcreditscompleteUC      ///
      total_csu_credits     = agcreditscompleteCSU     ///
      total_days_attended   = ytdattendeddays          ///
      total_days_absent     = ytdofabsentdays          ///
      total_days_enrolled   = ytdenrolleddays          ///
      ap_ela ap_math                                   ///
      numSAT_took           = tookSAT

  * Statistics taken as the maximum over each student's rows
  local maxSpec                                        ///
      tookSAT college_complete_status                  ///
      college_met_status csu_complete_status           ///
      max_math_sat   = sat_math                        ///
      max_write_sate = sat_write                       ///
      max_verbal_sat = sat_verbal                      ///
      uc_met_status csu_met_status

  collapse (sum) `sumSpec' (max) `maxSpec', by(studentpseudoid)

  * Missing whenever total_days_enrolled is zero
  gen total_attendance_rate = total_days_attended/total_days_enrolled

  save ${datadir}/student_behavior_data.dta, replace
end 

* Run the full build pipeline defined in program `main' near the top of this file
main


* Stop timer 1 (started at the top of the file) and report the elapsed run time
timer off 1 
timer list 1 



